From cf864a1c2ee5bba6e344a0d7aa11f1bbdb75f1ad Mon Sep 17 00:00:00 2001 From: yudrywet Date: Sun, 14 Apr 2024 20:11:34 +0800 Subject: [PATCH 01/56] chore: fix some typos in comments Signed-off-by: yudrywet --- milli/src/search/new/matches/mod.rs | 4 ++-- milli/src/search/new/query_term/parse_query.rs | 2 +- milli/src/update/facet/incremental.rs | 2 +- .../index_documents/extract/extract_fid_docid_facet_values.rs | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 2913f206d..8f0069589 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -134,7 +134,7 @@ impl<'t> Matcher<'t, '_> { for (token_position, word_position, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, - // we temporarly save the current token then we try to match the next one. + // we temporarily save the current token then we try to match the next one. Some(MatchType::Partial(partial)) => { potential_matches.push((token_position, word_position, partial.char_len())); partial @@ -722,7 +722,7 @@ mod tests { @"…void void void void void split the world void void" ); - // Text containing matches with diferent density. + // Text containing matches with different density. let text = "split void the void void world void void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. 
diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 86be7da77..93f5f081c 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -119,7 +119,7 @@ pub fn located_query_terms_from_tokens( if let Some(located_query_term) = phrase.build(ctx) { // as we are evaluating a negative operator we put the phrase // in the negative one *but* we don't reset the negative operator - // as we are immediatly starting a new negative phrase. + // as we are immediately starting a new negative phrase. if negative_phrase { negative_phrases.push(located_query_term); } else { diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 798e0fe3d..f871eee31 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -499,7 +499,7 @@ impl FacetsUpdateIncrementalInner { ModificationResult::Expand | ModificationResult::Reduce { .. } ) { - // if any modification occured, insert it in the database. + // if any modification occurred, insert it in the database. self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; Ok(insertion_key_modification) } else { diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 1f8af372d..d88d96919 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -36,7 +36,7 @@ pub struct ExtractedFacetValues { /// Extracts the facet values of each faceted field of each document. 
/// -/// Returns the generated grenad reader containing the docid the fid and the orginal value as key +/// Returns the generated grenad reader containing the docid the fid and the original value as key /// and the normalized value as value extracted from the given chunk of documents. /// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially. #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] From ab43a8a9497bf55fb958d9d01b800c2a2fa8362b Mon Sep 17 00:00:00 2001 From: writegr Date: Thu, 18 Apr 2024 14:12:52 +0800 Subject: [PATCH 02/56] chore: fix some typos in comments Signed-off-by: writegr --- filter-parser/src/lib.rs | 2 +- index-scheduler/src/batch.rs | 2 +- meilisearch-types/src/deserr/mod.rs | 2 +- meilitool/src/main.rs | 2 +- milli/src/documents/builder.rs | 2 +- milli/src/search/new/geo_sort.rs | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index fa5b70606..6bfbbb024 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -568,7 +568,7 @@ pub mod tests { insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#); insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#); insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#); - // but it also works with other sequencies + // but it also works with other sequences insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}"); } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 3161dc499..bc9823a01 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -13,7 +13,7 @@ We can combine the two tasks in a single batch: 1. 
import documents X and Y Processing this batch is functionally equivalent to processing the two -tasks individally, but should be much faster since we are only performing +tasks individually, but should be much faster since we are only performing one indexing operation. */ diff --git a/meilisearch-types/src/deserr/mod.rs b/meilisearch-types/src/deserr/mod.rs index 537b24574..bf1aa1da5 100644 --- a/meilisearch-types/src/deserr/mod.rs +++ b/meilisearch-types/src/deserr/mod.rs @@ -26,7 +26,7 @@ pub type DeserrQueryParamError = DeserrError { pub msg: String, diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index bace7d16b..bfcbfdd6d 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -129,7 +129,7 @@ fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { } } - eprintln!("Sucessfully deleted {count} content files from disk!"); + eprintln!("Successfully deleted {count} content files from disk!"); Ok(()) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index e5124f67f..ec4d634aa 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -203,7 +203,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) { "string" => (field_name, AllowedType::String), "boolean" => (field_name, AllowedType::Boolean), "number" => (field_name, AllowedType::Number), - // if the pattern isn't reconized, we keep the whole field. + // if the pattern isn't recognized, we keep the whole field. _otherwise => (header, AllowedType::String), }, None => (header, AllowedType::String), diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index 5f5ceb379..4081c9637 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -42,7 +42,7 @@ fn facet_number_values<'a>( } /// Define the strategy used by the geo sort. 
-/// The paramater represents the cache size, and, in the case of the Dynamic strategy, +/// The parameter represents the cache size, and, in the case of the Dynamic strategy, /// the point where we move from using the iterative strategy to the rtree. #[derive(Debug, Clone, Copy)] pub enum Strategy { From 7f5ab3cef57091e2fa1280e10875cdf9c3f950de Mon Sep 17 00:00:00 2001 From: Simon Detheridge Date: Fri, 3 May 2024 12:29:31 +0100 Subject: [PATCH 03/56] Use http path pattern instead of full path in metrics --- meilisearch/src/middleware.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/middleware.rs b/meilisearch/src/middleware.rs index 6707bb6d5..f3e665c29 100644 --- a/meilisearch/src/middleware.rs +++ b/meilisearch/src/middleware.rs @@ -59,10 +59,12 @@ where let request_path = req.path(); let is_registered_resource = req.resource_map().has_resource(request_path); if is_registered_resource { + let request_pattern = req.match_pattern(); + let metric_path = request_pattern.as_ref().map_or(request_path, String::as_str); let request_method = req.method().to_string(); histogram_timer = Some( crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS - .with_label_values(&[&request_method, request_path]) + .with_label_values(&[&request_method, metric_path]) .start_timer(), ); } From 3698aef66bdf3b1bca2ad3b7a3362712e9dbd6b7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 May 2024 11:36:37 +0200 Subject: [PATCH 04/56] fix warning --- meilisearch/src/routes/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 7cf886017..a2fceb764 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -376,12 +376,6 @@ async fn get_version( }) } -#[derive(Serialize)] -struct KeysResponse { - private: Option, - public: Option, -} - pub async fn get_health( req: HttpRequest, index_scheduler: Data, From f33a1282f8987dde2a7cde3d69297c564952b765 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 May 2024 10:31:39 +0200 Subject: [PATCH 05/56] Bump Rustls to v0.21.12 --- Cargo.lock | 160 ++++++++++++++++++++--------------------- meilisearch/Cargo.toml | 2 +- 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fad60e8da..2a8bdcbd3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,7 +80,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -424,7 +424,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -435,7 +435,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -552,7 +552,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -685,7 +685,7 @@ checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -845,9 +845,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.90" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" dependencies = [ "jobserver", "libc", @@ -992,7 +992,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1304,7 +1304,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1326,7 +1326,7 @@ checksum = 
"836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core 0.20.3", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1356,7 +1356,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1460,7 +1460,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1678,7 +1678,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1698,7 +1698,7 @@ checksum = "03cdc46ec28bd728e67540c528013c6a10eb69a02eb31078a1bda695438cbfb8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1781,7 +1781,7 @@ dependencies = [ "darling 0.20.3", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "uuid", ] @@ -1913,7 +1913,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -2117,9 +2117,9 @@ checksum = "36d244a08113319b5ebcabad2b8b7925732d15eec46d7e7ac3c11734f3b7a6ad" [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "js-sys", @@ -2413,7 +2413,7 @@ dependencies = [ "futures-util", "http 0.2.11", "hyper", - "rustls 0.21.10", + "rustls 0.21.12", "tokio", "tokio-rustls", ] @@ -3157,7 +3157,7 @@ checksum = "fc2fb41a9bb4257a3803154bdf7e2df7d45197d1941c9b1a90ad815231630721" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3213,9 +3213,9 @@ checksum = "e34f76eb3611940e0e7d53a9aaa4e6a3151f69541a282fd0dad5571420c53ff1" [[package]] name = "lock_api" -version = 
"0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -3258,7 +3258,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3342,7 +3342,7 @@ dependencies = [ "rayon", "regex", "reqwest", - "rustls 0.21.10", + "rustls 0.21.12", "rustls-pemfile", "segment", "serde", @@ -3598,7 +3598,7 @@ checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3942,7 +3942,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3996,7 +3996,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4025,7 +4025,7 @@ checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4133,9 +4133,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ "unicode-ident", ] @@ -4207,9 +4207,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -4391,7 +4391,7 @@ dependencies = [ "once_cell", "percent-encoding", 
"pin-project-lite", - "rustls 0.21.10", + "rustls 0.21.12", "rustls-pemfile", "serde", "serde_json", @@ -4505,9 +4505,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.10" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", @@ -4517,9 +4517,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" dependencies = [ "log", "ring", @@ -4540,9 +4540,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" [[package]] name = "rustls-webpki" @@ -4667,7 +4667,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4941,7 +4941,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4963,9 +4963,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.58" +version = "2.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" dependencies = [ "proc-macro2", "quote", @@ -4989,7 +4989,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5100,7 +5100,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5243,7 +5243,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5252,7 +5252,7 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.21.10", + "rustls 0.21.12", "tokio", ] @@ -5354,7 +5354,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5537,7 +5537,7 @@ dependencies = [ "flate2", "log", "once_cell", - "rustls 0.22.2", + "rustls 0.22.3", "rustls-pki-types", "rustls-webpki 0.102.2", "serde", @@ -5703,7 +5703,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "wasm-bindgen-shared", ] @@ -5737,7 +5737,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5834,7 +5834,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -5843,7 +5843,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -5870,7 +5870,7 @@ version = "0.52.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -5905,17 +5905,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -5932,9 +5932,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" @@ -5950,9 +5950,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" @@ -5968,9 +5968,9 @@ checksum = 
"622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" @@ -5986,9 +5986,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" @@ -6004,9 +6004,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" @@ -6022,9 +6022,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" @@ -6040,9 +6040,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" [[package]] name = "winnow" @@ -6140,7 +6140,7 @@ checksum = "9e6936f0cce458098a201c245a11bef556c6a0181129c7034d10d76d1ec3a2b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "synstructure", ] @@ -6161,7 +6161,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -6181,7 +6181,7 @@ checksum = "e6a647510471d372f2e6c2e6b7219e44d8c574d24fdc11c610a61455782f18c3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "synstructure", ] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 4a2b11b21..ed62c5f48 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -75,7 +75,7 @@ reqwest = { version = "0.11.23", features = [ "rustls-tls", "json", ], default-features = false } -rustls = "0.21.6" +rustls = "0.21.12" rustls-pemfile = "1.0.2" segment = { version = "0.2.3", optional = true } serde = { version = "1.0.195", features = ["derive"] } From ac4bc143c4dff49d6a5d2fb9730a7202c001b5c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 May 2024 10:39:38 +0200 Subject: [PATCH 06/56] Bump ureq to v2.9.7 --- Cargo.lock | 34 ++++++++++++++++++++-------------- index-scheduler/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a8bdcbd3..937fce64a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -486,6 +486,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64ct" version = "1.6.0" @@ -4517,9 +4523,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring", @@ -4643,9 +4649,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.197" +version = "1.0.198" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = "9846a40c979031340571da2545a4e5b7c4163bdae79b301d5f86d03979451fcc" dependencies = [ "serde_derive", ] @@ -4661,9 +4667,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.198" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "e88edab869b01783ba905e7d0153f9fc1a6505a96e4ad3018011eedb838566d9" dependencies = [ "proc-macro2", "quote", @@ -4672,9 +4678,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "indexmap", "itoa", @@ -5469,9 +5475,9 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" 
[[package]] name = "unicode-blocks" @@ -5529,15 +5535,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.6" +version = "2.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" +checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "flate2", "log", "once_cell", - "rustls 0.22.3", + "rustls 0.22.4", "rustls-pki-types", "rustls-webpki 0.102.2", "serde", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index c758f1114..4b6c0a36d 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -37,7 +37,7 @@ time = { version = "0.3.31", features = [ "macros", ] } tracing = "0.1.40" -ureq = "2.9.1" +ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 082cd0812..7d903178b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -85,7 +85,7 @@ liquid = "0.26.4" arroy = "0.2.0" rand = "0.8.5" tracing = "0.1.40" -ureq = { version = "2.9.6", features = ["json"] } +ureq = { version = "2.9.7", features = ["json"] } url = "2.5.0" [dev-dependencies] From 2a0ece814cc904a828bae325cd1977c6659bdc04 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 7 May 2024 12:23:36 +0200 Subject: [PATCH 07/56] Add precommands to workloads --- xtask/src/bench/workload.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/xtask/src/bench/workload.rs b/xtask/src/bench/workload.rs index d82c5ad19..db44b5a8f 100644 --- a/xtask/src/bench/workload.rs +++ b/xtask/src/bench/workload.rs @@ -22,6 +22,8 @@ pub struct Workload { pub run_count: u16, pub extra_cli_args: Vec, pub assets: BTreeMap, + #[serde(default)] + pub precommands: Vec, pub commands: Vec, } @@ -37,6 +39,15 @@ async fn run_commands( let report_folder = 
&args.report_folder; let workload_name = &workload.name; + for batch in workload + .precommands + .as_slice() + .split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait)) + { + super::command::run_batch(meili_client, batch, &workload.assets, &args.asset_folder) + .await?; + } + std::fs::create_dir_all(report_folder) .with_context(|| format!("could not create report directory at {report_folder}"))?; From 43763eb98ac6e9e6630fb4b5f34c21f9539a6ac2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 7 May 2024 12:25:57 +0200 Subject: [PATCH 08/56] Document precommands --- BENCHMARKS.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index e588b1b5b..e1d0c5feb 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -187,8 +187,8 @@ They are JSON files with the following structure (comments are not actually supp }, // Core of the workload. // A list of commands to run sequentially. - // A command is a request to the Meilisearch instance that is executed while the profiling runs. - "commands": [ + // Optional: A precommand is a request to the Meilisearch instance that is executed before the profiling runs. + "precommands": [ { // Meilisearch route to call. `http://localhost:7700/` will be prepended. "route": "indexes/movies/settings", @@ -224,8 +224,11 @@ They are JSON files with the following structure (comments are not actually supp // - DontWait: run the next command without waiting the response to this one. // - WaitForResponse: run the next command as soon as the response from the server is received. // - WaitForTask: run the next command once **all** the Meilisearch tasks created up to now have finished processing. - "synchronous": "DontWait" - }, + "synchronous": "WaitForTask" + } + ], + // A command is a request to the Meilisearch instance that is executed while the profiling runs. 
+ "commands": [ { "route": "indexes/movies/documents", "method": "POST", From 9d3ff11b21c896433f11a0ae9e2837095bbcabfe Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 7 May 2024 14:03:14 +0200 Subject: [PATCH 09/56] Modify existing workload files to use precommands --- workloads/hackernews.json | 8 +++--- workloads/movies-nothreads.json | 8 +++--- workloads/movies-subset-hf-embeddings.json | 6 +++-- workloads/settings-add-embeddings.json | 6 +++-- workloads/settings-add-remove-filters.json | 6 +++-- workloads/settings-proximity-precision.json | 6 +++-- .../settings-remove-add-swap-searchable.json | 6 +++-- workloads/settings-typo.json | 27 ++++++++++++++++--- 8 files changed, 53 insertions(+), 20 deletions(-) diff --git a/workloads/hackernews.json b/workloads/hackernews.json index 0a99b69ff..5762a7309 100644 --- a/workloads/hackernews.json +++ b/workloads/hackernews.json @@ -54,7 +54,7 @@ "sha256": "27e25efd0b68b159b8b21350d9af76938710cb29ce0393fa71b41c4f3c630ffe" } }, - "commands": [ + "precommands": [ { "route": "indexes/movies/settings", "method": "PATCH", @@ -78,8 +78,10 @@ ] } }, - "synchronous": "DontWait" - }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/workloads/movies-nothreads.json b/workloads/movies-nothreads.json index 175daacf9..d3dd006fa 100644 --- a/workloads/movies-nothreads.json +++ b/workloads/movies-nothreads.json @@ -11,7 +11,7 @@ "sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1" } }, - "commands": [ + "precommands": [ { "route": "indexes/movies/settings", "method": "PATCH", @@ -30,8 +30,10 @@ ] } }, - "synchronous": "DontWait" - }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/movies-subset-hf-embeddings.json index d24bc752c..d7672cf73 100644 --- a/workloads/movies-subset-hf-embeddings.json +++ 
b/workloads/movies-subset-hf-embeddings.json @@ -11,7 +11,7 @@ "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" } }, - "commands": [ + "precommands": [ { "route": "experimental-features", "method": "PATCH", @@ -55,7 +55,9 @@ } }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/workloads/settings-add-embeddings.json b/workloads/settings-add-embeddings.json index f87286943..6ad50769a 100644 --- a/workloads/settings-add-embeddings.json +++ b/workloads/settings-add-embeddings.json @@ -11,7 +11,7 @@ "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" } }, - "commands": [ + "precommands": [ { "route": "experimental-features", "method": "PATCH", @@ -49,7 +49,9 @@ "asset": "movies-100.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/movies/settings", "method": "PATCH", diff --git a/workloads/settings-add-remove-filters.json b/workloads/settings-add-remove-filters.json index 12493a8fc..f017ed960 100644 --- a/workloads/settings-add-remove-filters.json +++ b/workloads/settings-add-remove-filters.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -59,7 +59,9 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", diff --git a/workloads/settings-proximity-precision.json b/workloads/settings-proximity-precision.json index 384f99e37..ac6d98da0 100644 --- a/workloads/settings-proximity-precision.json +++ b/workloads/settings-proximity-precision.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -61,7 +61,9 @@ "asset": 
"150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", diff --git a/workloads/settings-remove-add-swap-searchable.json b/workloads/settings-remove-add-swap-searchable.json index 61db8822e..7f70d1ce8 100644 --- a/workloads/settings-remove-add-swap-searchable.json +++ b/workloads/settings-remove-add-swap-searchable.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -61,7 +61,9 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", diff --git a/workloads/settings-typo.json b/workloads/settings-typo.json index 45163bc98..e04135877 100644 --- a/workloads/settings-typo.json +++ b/workloads/settings-typo.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -62,14 +62,18 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", "body": { "inline": { "typoTolerance": { - "disableOnAttributes": ["featured_job_organization_name"] + "disableOnAttributes": [ + "featured_job_organization_name" + ] } } }, @@ -93,7 +97,22 @@ "body": { "inline": { "typoTolerance": { - "disableOnWords": ["Ben","Elowitz","Kevin","Flaherty", "Ron", "Dustin", "Owen", "Chris", "Mark", "Matt", "Peter", "Van", "Head", "of"] + "disableOnWords": [ + "Ben", + "Elowitz", + "Kevin", + "Flaherty", + "Ron", + "Dustin", + "Owen", + "Chris", + "Mark", + "Matt", + "Peter", + "Van", + "Head", + "of" + ] } } }, From c22460045c8b0d5a830caeae5c3b1856ccfa7b90 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 May 2024 14:49:45 +0200 Subject: [PATCH 10/56] Stops 
returning an option in the internal searchable fields --- milli/src/fieldids_weights_map.rs | 28 +++++ milli/src/index.rs | 108 ++++++++++++------ milli/src/lib.rs | 3 + milli/src/search/new/db_cache.rs | 12 +- milli/src/search/new/exact_attribute.rs | 8 +- milli/src/search/new/mod.rs | 17 +-- .../search/new/ranking_rule_graph/fid/mod.rs | 12 +- .../extract/extract_docid_word_positions.rs | 6 +- milli/src/update/settings.rs | 16 +-- 9 files changed, 120 insertions(+), 90 deletions(-) create mode 100644 milli/src/fieldids_weights_map.rs diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs new file mode 100644 index 000000000..255f6ab80 --- /dev/null +++ b/milli/src/fieldids_weights_map.rs @@ -0,0 +1,28 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{FieldId, Weight}; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct FieldidsWeightsMap { + map: HashMap, +} + +impl FieldidsWeightsMap { + pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option { + self.map.insert(fid, weight) + } + + pub fn remove(&mut self, fid: FieldId) -> Option { + self.map.remove(&fid) + } + + pub fn weight(&self, fid: FieldId) -> Option { + self.map.get(&fid).copied() + } + + pub fn max_weight(&self) -> Option { + self.map.values().copied().max() + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 27b273393..b6b07404b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -25,8 +26,9 @@ use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, - FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, - Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, 
U8StrStrCodec, BEU16, BEU32, BEU64, + FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap, + GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, + BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -42,6 +44,7 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -414,6 +417,32 @@ impl Index { .unwrap_or_default()) } + /* fieldids weights map */ + // This maps the fields ids to their weights. + // Their weights is defined by the ordering of the searchable attributes. + + /// Writes the fieldids weights map which associates the field ids to their weights + pub(crate) fn put_fieldids_weights_map( + &self, + wtxn: &mut RwTxn, + map: &FieldidsWeightsMap, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + wtxn, + main_key::FIELDIDS_WEIGHTS_MAP_KEY, + map, + ) + } + + /// Get the fieldids weights map which associates the field ids to their weights + pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .remap_types::>() + .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)? + .unwrap_or_default()) + } + /* geo rtree */ /// Writes the provided `rtree` which associates coordinates to documents ids. @@ -578,10 +607,12 @@ impl Index { wtxn: &mut RwTxn, user_fields: &[&str], fields_ids_map: &FieldsIdsMap, - ) -> heed::Result<()> { + ) -> Result<()> { // We can write the user defined searchable fields as-is. 
self.put_user_defined_searchable_fields(wtxn, user_fields)?; + let mut weights = self.fieldids_weights_map(&wtxn)?; + // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. // 2. Iterate over the user defined searchable fields. @@ -589,17 +620,23 @@ impl Index { // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. let mut real_fields = user_fields.to_vec(); - for field_from_map in fields_ids_map.names() { - for user_field in user_fields { + for (id, field_from_map) in fields_ids_map.iter() { + for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !user_fields.contains(&field_from_map) { real_fields.push(field_from_map); + + let weight: u16 = + weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; + weights.insert(id, weight as u16); } } } - self.put_searchable_fields(wtxn, &real_fields) + self.put_searchable_fields(wtxn, &real_fields)?; + self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { @@ -623,28 +660,31 @@ impl Index { } /// Returns the searchable fields, those are the fields that are indexed, - /// if the searchable fields aren't there it means that **all** the fields are indexed. - pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { self.main .remap_types::>>() - .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY) + .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)? + .map(|fields| Ok(fields.into_iter().map(|field| Cow::Borrowed(field)).collect())) + .unwrap_or_else(|| { + Ok(self + .fields_ids_map(rtxn)? + .names() + .map(|field| Cow::Owned(field.to_string())) + .collect()) + }) } /// Identical to `searchable_fields`, but returns the ids instead. 
- pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { - match self.searchable_fields(rtxn)? { - Some(fields) => { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let mut fields_ids = Vec::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(name) { - fields_ids.push(field_id); - } - } - Ok(Some(fields_ids)) + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.searchable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.push(field_id); } - None => Ok(None), } + Ok(fields_ids) } /// Writes the searchable fields, when this list is specified, only these are indexed. @@ -1710,10 +1750,14 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, field_distribution, 1); + db_snap!(index, field_distribution, @r###" + age 1 | + id 2 | + name 2 | + "###); db_snap!(index, word_docids, - @r###" + @r###" 1 [0, ] 2 [1, ] 20 [1, ] @@ -1722,18 +1766,6 @@ pub(crate) mod tests { "### ); - db_snap!(index, field_distribution); - - db_snap!(index, field_distribution, - @r###" - age 1 | - id 2 | - name 2 | - "### - ); - - // snapshot_index!(&index, "1", include: "^field_distribution$"); - // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end index @@ -1820,7 +1852,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); @@ -1840,7 +1872,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); @@ -1856,7 +1888,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a1e240464..881633b5c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,6 +28,7 @@ pub mod vector; #[cfg(test)] #[macro_use] pub mod snapshot_tests; +mod fieldids_weights_map; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; @@ -52,6 +53,7 @@ pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; pub use self::external_documents_ids::ExternalDocumentsIds; +pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::FieldsIdsMap; pub use 
self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, @@ -77,6 +79,7 @@ pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type FieldDistribution = BTreeMap; pub type FieldId = u16; +pub type Weight = u16; pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 62c921a1d..a99000f60 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> { .map_err(heed::Error::Decoding)? } else { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut docids = RoaringBitmap::new(); for fid in fids { // for each field, intersect left word bitmap and right word bitmap, @@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> { let prefix_docids = match proximity_precision { ProximityPrecision::ByAttribute => { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut prefix_docids = RoaringBitmap::new(); // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. 
diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 7932f0c2a..41b70ae39 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -184,13 +184,7 @@ impl State { return Ok(State::Empty(query_graph.clone())); } - let searchable_fields_ids = { - if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? { - fids - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().collect() - } - }; + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?; let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 617068ef8..acbb3638b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -96,27 +96,22 @@ impl<'ctx> SearchContext<'ctx> { contains_wildcard = true; continue; } - let searchable_contains_name = - searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name)); + let searchable_contains_name = searchable_names.iter().any(|name| name == field_name); let fid = match (fids_map.id(field_name), searchable_contains_name) { // The Field id exist and the field is searchable - (Some(fid), Some(true)) | (Some(fid), None) => fid, + (Some(fid), true) => fid, // The field is searchable but the Field id doesn't exist => Internal Error - (None, Some(true)) => { + (None, true) => { return Err(FieldIdMapMissingEntry::FieldName { field_name: field_name.to_string(), process: "search", } .into()) } - // The field is not searchable, but the searchableAttributes are set to * => ignore field - (None, None) => continue, // The field is not searchable => User error - (_fid, Some(false)) => { - let (valid_fields, hidden_fields) = match searchable_names { - Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?, - None => self.index.remove_hidden_fields(self.txn, 
fids_map.names())?, - }; + (_fid, false) => { + let (valid_fields, hidden_fields) = + self.index.remove_hidden_fields(self.txn, searchable_names)?; let field = field_name.to_string(); return Err(UserError::InvalidSearchableAttribute { diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 8f3e0cc82..cf65249de 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -77,17 +77,7 @@ impl RankingRuleGraphTrait for FidGraph { } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_fid: Option = { - if let Some(max_fid) = ctx - .index - .searchable_fields_ids(ctx.txn)? - .map(|field_ids| field_ids.into_iter().max()) - { - max_fid - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().max() - } - }; + let max_fid: Option = ctx.index.searchable_fields_ids(ctx.txn)?.into_iter().max(); if let Some(max_fid) = max_fid { if !all_fields.contains(&max_fid) { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6af5bba6d..d97b6639e 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -186,7 +186,7 @@ fn searchable_fields_changed( ) -> bool { let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.contains(&field_id) { let del_add = KvReaderDelAdd::new(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -298,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>( /// Extract words mapped with their positions of a document. 
fn tokens_from_document<'a>( obkv: &KvReader, - searchable_fields: &Option>, + searchable_fields: &[FieldId], tokenizer: &Tokenizer, max_positions_per_attributes: u32, del_add: DelAdd, @@ -309,7 +309,7 @@ fn tokens_from_document<'a>( let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { // if field is searchable. - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.as_ref().contains(&field_id) { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { // parse json. diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1997e966e..c0742a74a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -468,14 +468,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else let old_fields = self.index.searchable_fields(self.wtxn)?; - let did_change = match old_fields { - // If old_fields is Some, let's check to see if the fields actually changed - Some(old_fields) => { - let new_fields = fields.iter().map(String::as_str).collect::>(); - new_fields != old_fields - } - // If old_fields is None, the fields have changed (because they are being set) - None => true, + let did_change = { + let new_fields = fields.iter().map(String::as_str).collect::>(); + new_fields != old_fields }; if !did_change { return Ok(false); @@ -1172,7 +1167,7 @@ pub(crate) struct InnerIndexSettings { pub user_defined_faceted_fields: HashSet, pub user_defined_searchable_fields: Option>, pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Option>, + pub searchable_fields_ids: Vec, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, @@ -1517,6 +1512,7 @@ mod tests { use big_s::S; use heed::types::Bytes; use maplit::{btreemap, btreeset, 
hashset}; + use meili_snap::snapshot; use super::*; use crate::error::Error; @@ -1576,7 +1572,7 @@ mod tests { // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - assert_eq!(searchable_fields, None); + snapshot!(format!("{searchable_fields:?}"), @r###"["name", "id", "age"]"###); let result = index.search(&rtxn).query("23").execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); From 4e4a1ddff7807c1268579bda54c53cd6fca29547 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 May 2024 16:37:34 +0200 Subject: [PATCH 11/56] gate a test behind the required feature --- milli/src/update/index_documents/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index bb180a7ee..936ce1efc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -3260,6 +3260,7 @@ mod tests { } #[test] + #[cfg(feature = "all-tokenizations")] fn stored_detected_script_and_language_should_not_return_deleted_documents() { use charabia::{Language, Script}; let index = TempIndex::new(); From 685f452fb2524c9c3f67218fb2dd273d59ba5110 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 May 2024 17:56:40 +0200 Subject: [PATCH 12/56] Fix the indexing of the searchable --- milli/examples/search.rs | 2 +- milli/src/fieldids_weights_map.rs | 4 + milli/src/index.rs | 85 ++++++++++- milli/src/search/mod.rs | 4 +- milli/src/search/new/db_cache.rs | 140 ++++++------------ .../src/search/new/matches/matching_words.rs | 2 +- milli/src/search/new/matches/mod.rs | 2 +- milli/src/search/new/mod.rs | 83 ++++++----- .../src/search/new/query_term/parse_query.rs | 2 +- milli/src/search/new/tests/attribute_fid.rs | 15 +- milli/src/snapshot_tests.rs | 25 ++++ milli/src/update/settings.rs | 25 
++-- 12 files changed, 235 insertions(+), 154 deletions(-) diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 8640acf42..3d10ec599 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -48,7 +48,7 @@ fn main() -> Result<(), Box> { let start = Instant::now(); - let mut ctx = SearchContext::new(&index, &txn); + let mut ctx = SearchContext::new(&index, &txn)?; let universe = filtered_universe(&ctx, &None)?; let docs = execute_search( diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 255f6ab80..bead160e9 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -25,4 +25,8 @@ impl FieldidsWeightsMap { pub fn max_weight(&self) -> Option { self.map.values().copied().max() } + + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + self.map.keys().copied() + } } diff --git a/milli/src/index.rs b/milli/src/index.rs index b6b07404b..e9f0f75de 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -28,7 +28,7 @@ use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, - BEU16, BEU32, BEU64, + Weight, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -443,6 +443,27 @@ impl Index { .unwrap_or_default()) } + pub fn searchable_fields_and_weights<'a>( + &self, + rtxn: &'a RoTxn, + ) -> heed::Result, FieldId, Weight)>> { + let fid_map = self.fields_ids_map(rtxn)?; + let weight_map = self.fieldids_weights_map(rtxn)?; + let searchable = self.searchable_fields(rtxn)?; + + Ok(searchable + .into_iter() + .map(|field| { + // the searchable attributes are a subset of the field id map + let fid = fid_map.id(&field).unwrap(); + // all the searchable fields have a weight + let weight = weight_map.weight(fid).unwrap(); + + (field, 
fid, weight) + }) + .collect()) + } + /* geo rtree */ /// Writes the provided `rtree` which associates coordinates to documents ids. @@ -605,9 +626,25 @@ impl Index { pub(crate) fn put_all_searchable_fields_from_fields_ids_map( &self, wtxn: &mut RwTxn, - user_fields: &[&str], + user_fields: Option<&[&str]>, fields_ids_map: &FieldsIdsMap, ) -> Result<()> { + // Special case if there is no user defined fields. + // Then the whole field id map is marked as searchable. + if user_fields.is_none() { + let mut weights = self.fieldids_weights_map(&wtxn)?; + let mut searchable = Vec::new(); + for (weight, (fid, name)) in fields_ids_map.iter().enumerate() { + searchable.push(name); + weights.insert(fid, weight as u16); + } + self.put_searchable_fields(wtxn, &searchable)?; + self.put_fieldids_weights_map(wtxn, &weights)?; + return Ok(()); + } + + let user_fields = user_fields.unwrap(); + // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; @@ -617,13 +654,13 @@ impl Index { // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. // 2. Iterate over the user defined searchable fields. // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. - let mut real_fields = user_fields.to_vec(); + // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. 
+ let mut real_fields = Vec::new(); for (id, field_from_map) in fields_ids_map.iter() { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) - && !user_fields.contains(&field_from_map) + && !real_fields.contains(&field_from_map) { real_fields.push(field_from_map); @@ -2427,6 +2464,14 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); index .add_documents(documents!([ @@ -2442,6 +2487,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); index.delete_documents(Default::default()); @@ -2452,6 +2507,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); index .add_documents(documents!([ @@ -2467,6 +2532,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bab67e6bd..7427db3a1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -147,7 +147,7 @@ impl<'a> Search<'a> { pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { - let ctx = 
SearchContext::new(self.index, self.rtxn); + let ctx = SearchContext::new(self.index, self.rtxn)?; filtered_universe(&ctx, &self.filter) } else { Ok(self.execute()?.candidates) @@ -155,7 +155,7 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let mut ctx = SearchContext::new(self.index, self.rtxn); + let mut ctx = SearchContext::new(self.index, self.rtxn)?; if let Some(searchable_attributes) = self.searchable_attributes { ctx.searchable_attributes(searchable_attributes)?; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index a99000f60..4985f55e9 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -159,58 +159,36 @@ impl<'ctx> SearchContext<'ctx> { /// Retrieve or insert the given value in the `word_docids` database. fn get_db_word_docids(&mut self, word: Interned) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - self.word_interner.get(word).as_str(), - &mut self.db_cache.word_docids, - self.index.word_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } fn get_db_exact_word_docids( &mut self, word: Interned, ) -> Result> { - match 
&self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.exact_word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - self.word_interner.get(word).as_str(), - &mut self.db_cache.exact_word_docids, - self.index.exact_word_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.exact_word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { @@ -238,58 +216,36 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - self.word_interner.get(prefix).as_str(), - &mut 
self.db_cache.word_prefix_docids, - self.index.word_prefix_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } fn get_db_exact_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.exact_word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - self.word_interner.get(prefix).as_str(), - &mut self.db_cache.exact_word_prefix_docids, - self.index.exact_word_prefix_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.exact_word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } pub fn get_db_word_pair_proximity_docids( @@ -465,8 +421,8 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the restricted list, return None. - if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + // if the requested fid isn't in the list of searchable, return None. 
+ if !self.searchable_fids.contains(&fid) { return Ok(None); } @@ -484,8 +440,8 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the restricted list, return None. - if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + // if the requested fid isn't in the searchable list, return None. + if !self.searchable_fids.contains(&fid) { return Ok(None); } diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 56bf6c169..4db1c99c6 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -258,7 +258,7 @@ pub(crate) mod tests { fn matching_words() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); + let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8f0069589..40e6f8dc8 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -506,7 +506,7 @@ mod tests { impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { - let mut ctx = SearchContext::new(index, rtxn); + let mut ctx = SearchContext::new(index, rtxn).unwrap(); let universe = filtered_universe(&ctx, &None).unwrap(); let crate::search::PartialSearchResult { located_query_terms, .. 
} = execute_search( &mut ctx, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index acbb3638b..90d971fa3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; -use crate::error::FieldIdMapMissingEntry; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::Embedder; use crate::{ AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget, - UserError, + UserError, Weight, }; /// A structure used throughout the execution of a search query. @@ -67,12 +66,25 @@ pub struct SearchContext<'ctx> { pub phrase_interner: DedupInterner, pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, - pub restricted_fids: Option, + pub searchable_fids: SearchableFids, } impl<'ctx> SearchContext<'ctx> { - pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { - Self { + pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result { + let searchable_fids = index.searchable_fields_and_weights(txn)?; + let exact_attributes_ids = index.exact_attributes_ids(txn)?; + + let mut exact = Vec::new(); + let mut tolerant = Vec::new(); + for (name, fid, weight) in searchable_fids { + if exact_attributes_ids.contains(&fid) { + exact.push((fid, weight)); + } else { + tolerant.push((fid, weight)); + } + } + + Ok(Self { index, txn, db_cache: <_>::default(), @@ -80,38 +92,32 @@ impl<'ctx> SearchContext<'ctx> { phrase_interner: <_>::default(), term_interner: <_>::default(), phrase_docids: <_>::default(), - restricted_fids: None, - } + searchable_fids: SearchableFids { tolerant, exact }, + }) } - pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> { + // TODO: TAMO continue here + pub fn 
searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> { + if attributes_to_search_on.contains(&String::from("*")) { + return Ok(()); + } + let fids_map = self.index.fields_ids_map(self.txn)?; - let searchable_names = self.index.searchable_fields(self.txn)?; + let searchable_names = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; - let mut restricted_fids = RestrictedFids::default(); - let mut contains_wildcard = false; - for field_name in searchable_attributes { - if field_name == "*" { - contains_wildcard = true; - continue; - } - let searchable_contains_name = searchable_names.iter().any(|name| name == field_name); - let fid = match (fids_map.id(field_name), searchable_contains_name) { + let mut restricted_fids = SearchableFids::default(); + for field_name in attributes_to_search_on { + let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name); + let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable - (Some(fid), true) => fid, - // The field is searchable but the Field id doesn't exist => Internal Error - (None, true) => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: field_name.to_string(), - process: "search", - } - .into()) - } + Some((_name, fid, weight)) => (*fid, *weight), // The field is not searchable => User error - (_fid, false) => { - let (valid_fields, hidden_fields) = - self.index.remove_hidden_fields(self.txn, searchable_names)?; + None => { + let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( + self.txn, + searchable_names.iter().map(|(name, _, _)| name), + )?; let field = field_name.to_string(); return Err(UserError::InvalidSearchableAttribute { @@ -124,13 +130,13 @@ impl<'ctx> SearchContext<'ctx> { }; if exact_attributes_ids.contains(&fid) { - restricted_fids.exact.push(fid); + restricted_fids.exact.push((fid, weight)); } else { - 
restricted_fids.tolerant.push(fid); + restricted_fids.tolerant.push((fid, weight)); }; } - self.restricted_fids = (!contains_wildcard).then_some(restricted_fids); + self.searchable_fids = restricted_fids; Ok(()) } @@ -152,14 +158,15 @@ impl Word { } #[derive(Debug, Clone, Default)] -pub struct RestrictedFids { - pub tolerant: Vec, - pub exact: Vec, +pub struct SearchableFids { + pub tolerant: Vec<(FieldId, Weight)>, + pub exact: Vec<(FieldId, Weight)>, } -impl RestrictedFids { +impl SearchableFids { pub fn contains(&self, fid: &FieldId) -> bool { - self.tolerant.contains(fid) || self.exact.contains(fid) + self.tolerant.iter().find(|(id, _)| id == fid).is_some() + || self.exact.iter().find(|(id, _)| id == fid).is_some() } } diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 93f5f081c..74b2ed564 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -366,7 +366,7 @@ mod tests { let tokens = tokenizer.tokenize("."); let index = temp_index_with_documents(); let rtxn = index.read_txn()?; - let mut ctx = SearchContext::new(&index, &rtxn); + let mut ctx = SearchContext::new(&index, &rtxn)?; // panics with `attempt to add with overflow` before let ExtractedTokens { query_terms, .. 
} = located_query_terms_from_tokens(&mut ctx, tokens, None)?; diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 38225404c..61b0a743b 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -1,5 +1,5 @@ use crate::index::tests::TempIndex; -use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -131,6 +131,19 @@ fn test_attribute_fid_simple() { #[test] fn test_attribute_fid_ngrams() { let index = create_index(); + db_snap!(index, fields_ids_map, @r###" + 0 title | + 1 description | + 2 plot | + 3 id | + "###); + db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + 2 2 | + "###); let txn = index.read_txn().unwrap(); diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 28c4cb45c..d79003747 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String { } snap } +pub fn snap_fieldids_weights_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let weights_map = index.fieldids_weights_map(&rtxn).unwrap(); + + let mut snap = String::new(); + writeln!(&mut snap, "fid weight").unwrap(); + let mut field_ids: Vec<_> = weights_map.ids().collect(); + field_ids.sort(); + for field_id in field_ids { + let weight = weights_map.weight(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {weight:<3} |").unwrap(); + } + snap +} +pub fn snap_searchable_fields(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + format!("{searchable_fields:?}") +} pub fn snap_geo_faceted_documents_ids(index: &Index) -> 
String { let rtxn = index.read_txn().unwrap(); let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); @@ -469,6 +488,12 @@ macro_rules! full_snap_of_db { ($index:ident, fields_ids_map) => {{ $crate::snapshot_tests::snap_fields_ids_map(&$index) }}; + ($index:ident, fieldids_weights_map) => {{ + $crate::snapshot_tests::snap_fieldids_weights_map(&$index) + }}; + ($index:ident, searchable_fields) => {{ + $crate::snapshot_tests::snap_searchable_fields(&$index) + }}; ($index:ident, geo_faceted_documents_ids) => {{ $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) }}; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c0742a74a..19b2c5778 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -496,7 +496,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, - &names, + Some(&names), &new_fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; @@ -1228,18 +1228,19 @@ impl InnerIndexSettings { // find and insert the new field ids pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { + let searchable_fields = self + .user_defined_searchable_fields + .as_ref() + .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); + // in case new fields were introduced we're going to recreate the searchable fields. 
- if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() { - let searchable_fields = - searchable_fields.iter().map(String::as_ref).collect::>(); - index.put_all_searchable_fields_from_fields_ids_map( - wtxn, - &searchable_fields, - &self.fields_ids_map, - )?; - let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - self.searchable_fields_ids = searchable_fields_ids; - } + index.put_all_searchable_fields_from_fields_ids_map( + wtxn, + searchable_fields.as_deref(), + &self.fields_ids_map, + )?; + let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; + self.searchable_fields_ids = searchable_fields_ids; Ok(()) } From 9ecde418531e199a9c558b4273e63e58649ff35d Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 May 2024 16:18:05 +0200 Subject: [PATCH 13/56] add a test on the current behaviour --- milli/src/index.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index e9f0f75de..c66222ab1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2627,4 +2627,52 @@ pub(crate) mod tests { db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted } + + #[test] + fn swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + + index + .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" })) + .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 id | + 2 age | + 3 realName | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("realName")]); + 
settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 realName | + 2 id | + 3 age | + "###); + db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); + } } From b0afe0972e109bdaaa532ef5f125e02f83930ab0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 May 2024 16:49:08 +0200 Subject: [PATCH 14/56] stop updating the fields ids map when fields are only swapped --- milli/src/index.rs | 9 +++++---- milli/src/update/settings.rs | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c66222ab1..d0d148d86 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2662,17 +2662,18 @@ pub(crate) mod tests { settings.set_filterable_fields(HashSet::from([S("age")])); }) .unwrap(); + // The order of the field id map shouldn't change db_snap!(index, fields_ids_map, @r###" 0 name | - 1 realName | - 2 id | - 3 age | + 1 id | + 2 age | + 3 realName | "###); db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 3 1 | "###); } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 19b2c5778..6875e6f47 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,6 +12,7 @@ use time::OffsetDateTime; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; +use crate::documents::FieldIdMapper; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::order_by_map::OrderByMap; @@ -461,8 +462,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(true) } - /// Updates the index's searchable attributes. 
This causes the field map to be recomputed to - /// reflect the order of the searchable attributes. + /// Updates the index's searchable attributes. fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { @@ -480,17 +480,20 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // ids for any settings that uses the facets. (distinct_fields, filterable_fields). let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut new_fields_ids_map = FieldsIdsMap::new(); - // fields are deduplicated, only the first occurrence is taken into account - let names = fields.iter().unique().map(String::as_str).collect::>(); + // Since we're updating the settings we can only add new fields at the end of the field id map + let mut new_fields_ids_map = old_fields_ids_map.clone(); + let names = fields + .iter() + // fields are deduplicated, only the first occurrence is taken into account + .unique() + .map(String::as_str) + .collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - for (_, name) in old_fields_ids_map.iter() { + // The fields ids map won't change the field id of already present elements thus only the + // new fields will be inserted. 
new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } From a0082c4df9f3cc5497678d4d6989dbba8674f31c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 14 May 2024 10:45:06 +0200 Subject: [PATCH 15/56] add a failing test on the attribute ranking rule --- milli/src/index.rs | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index d0d148d86..accfff719 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2662,6 +2662,7 @@ pub(crate) mod tests { settings.set_filterable_fields(HashSet::from([S("age")])); }) .unwrap(); + // The order of the field id map shouldn't change db_snap!(index, fields_ids_map, @r###" 0 name | @@ -2676,4 +2677,54 @@ pub(crate) mod tests { 3 1 | "###); } + + #[test] + fn attribute_weights_after_swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("beverage")]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "name": "kefir", "beverage": "water" }, + { "id": 1, "name": "tamo", "beverage": "kefir" } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find kefir the dog first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 0, + 1, + ] + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("beverage"), S("name")]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find tamo first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 0, + 1, + ] + "###); + } } From caa6a7149ac6967580c6e17a4a62f06f64f8312a Mon Sep 17 00:00:00 2001 From: 
Tamo Date: Tue, 14 May 2024 16:56:08 +0200 Subject: [PATCH 16/56] make the attribute ranking rule use the weights and fix the tests --- meilisearch/src/search_queue.rs | 3 + milli/src/index.rs | 2 +- milli/src/search/new/mod.rs | 29 ++- .../search/new/ranking_rule_graph/fid/mod.rs | 38 +-- milli/src/search/new/tests/attribute_fid.rs | 14 +- ...attribute_fid__attribute_fid_ngrams-4.snap | 244 ++++++++++++++++++ milli/src/update/settings.rs | 12 +- 7 files changed, 306 insertions(+), 36 deletions(-) create mode 100644 milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 6d5044d20..0fe9a5a53 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -85,6 +85,9 @@ impl SearchQueue { }, search_request = receive_new_searches.recv() => { + if search_request.is_none() { + continue; + } // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web let search_request = search_request.unwrap(); if searches_running < usize::from(parallelism) && queue.is_empty() { diff --git a/milli/src/index.rs b/milli/src/index.rs index accfff719..49f78f3cd 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2722,8 +2722,8 @@ pub(crate) mod tests { // We should find tamo first insta::assert_debug_snapshot!(results.documents_ids, @r###" [ - 0, 1, + 0, ] "###); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 90d971fa3..9a2ff5b02 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -76,7 +76,7 @@ impl<'ctx> SearchContext<'ctx> { let mut exact = Vec::new(); let mut tolerant = Vec::new(); - for (name, fid, weight) in searchable_fids { + for (_name, fid, weight) in searchable_fids { if exact_attributes_ids.contains(&fid) { exact.push((fid, weight)); } else { @@ -96,22 +96,26 @@ impl<'ctx> SearchContext<'ctx> { }) } - // TODO: TAMO 
continue here pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> { - if attributes_to_search_on.contains(&String::from("*")) { - return Ok(()); - } - - let fids_map = self.index.fields_ids_map(self.txn)?; + let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?; let searchable_names = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; + let mut wildcard = false; + let mut restricted_fids = SearchableFids::default(); for field_name in attributes_to_search_on { + if field_name == "*" { + wildcard = true; + // we cannot early exit as we want to returns error in case of unknown fields + continue; + } let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name); let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable Some((_name, fid, weight)) => (*fid, *weight), + // The field is not searchable but the user didn't define any searchable attributes + None if user_defined_searchable.is_none() => continue, // The field is not searchable => User error None => { let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( @@ -136,7 +140,16 @@ impl<'ctx> SearchContext<'ctx> { }; } - self.searchable_fids = restricted_fids; + if wildcard { + let (exact, tolerant) = searchable_names + .iter() + .map(|(_name, fid, weight)| (*fid, *weight)) + .partition(|(fid, _weight)| exact_attributes_ids.contains(fid)); + + self.searchable_fids = SearchableFids { tolerant, exact }; + } else { + self.searchable_fids = restricted_fids; + } Ok(()) } diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index cf65249de..e10f2fbab 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, 
Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; -use crate::Result; +use crate::{FieldId, Result}; #[derive(Clone, PartialEq, Eq, Hash)] pub struct FidCondition { term: LocatedQueryTermSubset, - fid: u16, + fid: Option, } pub enum FidGraph {} @@ -26,13 +26,16 @@ impl RankingRuleGraphTrait for FidGraph { universe: &RoaringBitmap, ) -> Result { let FidCondition { term, .. } = condition; - // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let mut docids = compute_query_term_subset_docids_within_field_id( - ctx, - &term.term_subset, - condition.fid, - )?; - docids &= universe; + + let docids = if let Some(fid) = condition.fid { + // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument + let mut docids = + compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; + docids &= universe; + docids + } else { + RoaringBitmap::new() + }; Ok(ComputedCondition { docids, @@ -68,24 +71,27 @@ impl RankingRuleGraphTrait for FidGraph { all_fields.extend(fields); } + let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?; + let mut edges = vec![]; for fid in all_fields.iter().copied() { + let weight = weights_map.weight(fid).unwrap(); edges.push(( - fid as u32 * term.term_ids.len() as u32, - conditions_interner.insert(FidCondition { term: term.clone(), fid }), + weight as u32 * term.term_ids.len() as u32, + conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), )); } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_fid: Option = ctx.index.searchable_fields_ids(ctx.txn)?.into_iter().max(); + let max_weight: Option = weights_map.max_weight(); - if let Some(max_fid) = max_fid { - if !all_fields.contains(&max_fid) { + if let Some(max_weight) 
= max_weight { + if !all_fields.contains(&max_weight) { edges.push(( - max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. + max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. conditions_interner.insert(FidCondition { term: term.clone(), // TODO remove this ugly clone - fid: max_fid, + fid: None, }), )); } diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 61b0a743b..c595887ba 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -132,17 +132,17 @@ fn test_attribute_fid_simple() { fn test_attribute_fid_ngrams() { let index = create_index(); db_snap!(index, fields_ids_map, @r###" - 0 title | - 1 description | - 2 plot | - 3 id | + 0 id | + 1 title | + 2 description | + 3 plot | "###); db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight - 0 0 | - 1 1 | - 2 2 | + 1 0 | + 2 1 | + 3 2 | "###); let txn = index.read_txn().unwrap(); diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap new file mode 100644 index 000000000..930a21626 --- /dev/null +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap @@ -0,0 +1,244 @@ +--- +source: milli/src/search/new/tests/attribute_fid.rs +expression: "format!(\"{document_ids_scores:#?}\")" +--- +[ + ( + 2, + [ + Fid( + Rank { + rank: 19, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), + ( + 6, + [ + Fid( + Rank { + rank: 15, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 5, + [ + Fid( + Rank { + rank: 14, + max_rank: 19, + }, + ), + 
Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 4, + [ + Fid( + Rank { + rank: 13, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 3, + [ + Fid( + Rank { + rank: 12, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 83, + max_rank: 91, + }, + ), + ], + ), + ( + 9, + [ + Fid( + Rank { + rank: 11, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 8, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 7, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 73, + max_rank: 91, + }, + ), + ], + ), + ( + 11, + [ + Fid( + Rank { + rank: 7, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 10, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 13, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 12, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 78, + max_rank: 91, + }, + ), + ], + ), + ( + 14, + [ + Fid( + Rank { + rank: 5, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 0, + [ + Fid( + Rank { + rank: 1, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), +] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6875e6f47..2e8ac157c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,7 +12,6 @@ use time::OffsetDateTime; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; -use crate::documents::FieldIdMapper; use crate::error::UserError; use 
crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::order_by_map::OrderByMap; @@ -1562,8 +1561,9 @@ mod tests { // we must find the appropriate document. let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); assert_eq!(documents.len(), 1); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); drop(rtxn); // We change the searchable fields to be the "name" field only. @@ -1575,12 +1575,16 @@ mod tests { // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); + let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap(); + snapshot!(format!("{user_defined_searchable_fields:?}"), @"None"); + // the searchable fields should contain all the fields let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - snapshot!(format!("{searchable_fields:?}"), @r###"["name", "id", "age"]"###); + snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###); let result = index.search(&rtxn).query("23").execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); } #[test] From 9fffb8e83dd13cbe2d88655a258e5391b648d01e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 14 May 2024 17:20:57 +0200 Subject: [PATCH 17/56] make clippy happy --- index-scheduler/src/utils.rs | 4 ++-- meilisearch/src/search.rs | 2 +- milli/src/fieldids_weights_map.rs | 2 +- milli/src/index.rs | 8 ++++---- milli/src/search/new/bucket_sort.rs | 4 ++-- 
milli/src/search/new/mod.rs | 3 +-- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs index 9f6f90db2..260ff6ee4 100644 --- a/index-scheduler/src/utils.rs +++ b/index-scheduler/src/utils.rs @@ -272,9 +272,9 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { } for index_uid in index_uids { if index_uid == swap.0 { - *index_uid = swap.1.to_owned(); + swap.1.clone_into(index_uid); } else if index_uid == swap.1 { - *index_uid = swap.0.to_owned(); + swap.0.clone_into(index_uid); } } } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a383434a2..34ebe463d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -730,7 +730,7 @@ pub fn perform_search( let mut ids = BTreeSet::new(); for attr in attrs { if attr == "*" { - ids = displayed_ids.clone(); + ids.clone_from(&displayed_ids); break; } diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index bead160e9..fdfe8fba2 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -26,7 +26,7 @@ impl FieldidsWeightsMap { self.map.values().copied().max() } - pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + pub fn ids(&self) -> impl Iterator + '_ { self.map.keys().copied() } } diff --git a/milli/src/index.rs b/milli/src/index.rs index 49f78f3cd..7fe9da0ff 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -632,7 +632,7 @@ impl Index { // Special case if there is no user defined fields. // Then the whole field id map is marked as searchable. if user_fields.is_none() { - let mut weights = self.fieldids_weights_map(&wtxn)?; + let mut weights = self.fieldids_weights_map(wtxn)?; let mut searchable = Vec::new(); for (weight, (fid, name)) in fields_ids_map.iter().enumerate() { searchable.push(name); @@ -648,7 +648,7 @@ impl Index { // We can write the user defined searchable fields as-is. 
self.put_user_defined_searchable_fields(wtxn, user_fields)?; - let mut weights = self.fieldids_weights_map(&wtxn)?; + let mut weights = self.fieldids_weights_map(wtxn)?; // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. @@ -666,7 +666,7 @@ impl Index { let weight: u16 = weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; - weights.insert(id, weight as u16); + weights.insert(id, weight); } } } @@ -701,7 +701,7 @@ impl Index { self.main .remap_types::>>() .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)? - .map(|fields| Ok(fields.into_iter().map(|field| Cow::Borrowed(field)).collect())) + .map(|fields| Ok(fields.into_iter().map(Cow::Borrowed).collect())) .unwrap_or_else(|| { Ok(self .fields_ids_map(rtxn)? diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 521fcb983..e9bc5449d 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -101,7 +101,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let mut ranking_rule_universes: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); + ranking_rule_universes[0].clone_from(universe); let mut cur_ranking_rule_index = 0; /// Finish iterating over the current ranking rule, yielding @@ -232,7 +232,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates); logger.start_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 9a2ff5b02..b7514cbb5 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -178,8 +178,7 @@ pub struct SearchableFids { impl 
SearchableFids { pub fn contains(&self, fid: &FieldId) -> bool { - self.tolerant.iter().find(|(id, _)| id == fid).is_some() - || self.exact.iter().find(|(id, _)| id == fid).is_some() + self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid) } } From 7ec4e2a3fbb89821f3a153a9adee05e405183720 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 15:02:26 +0200 Subject: [PATCH 18/56] apply all style review comments --- meilisearch/src/search_queue.rs | 12 +++++---- milli/src/error.rs | 2 ++ milli/src/fieldids_weights_map.rs | 9 +++++++ milli/src/index.rs | 27 ++++++++++--------- .../search/new/ranking_rule_graph/fid/mod.rs | 11 ++++---- milli/src/update/settings.rs | 20 +++++--------- 6 files changed, 45 insertions(+), 36 deletions(-) diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 0fe9a5a53..415da0c15 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -85,11 +85,13 @@ impl SearchQueue { }, search_request = receive_new_searches.recv() => { - if search_request.is_none() { - continue; - } - // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web - let search_request = search_request.unwrap(); + let search_request = match search_request { + Some(search_request) => search_request, + // This should never happen while actix-web is running, but it's not a reason to crash + // and it can generate a lot of noise in the tests. 
+ None => continue, + }; + if searches_running < usize::from(parallelism) && queue.is_empty() { searches_running += 1; // if the search requests die it's not a hard error on our side diff --git a/milli/src/error.rs b/milli/src/error.rs index e4550de1f..009781fcf 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -32,6 +32,8 @@ pub enum InternalError { DatabaseClosing, #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, + #[error("Missing {key} in the fieldids weights mapping.")] + FieldidsWeightsMapMissingEntry { key: FieldId }, #[error(transparent)] FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), #[error("Missing {key} in the field id mapping.")] diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index fdfe8fba2..72720a02a 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -1,3 +1,5 @@ +//! The fieldids weights map is in charge of storing linking the searchable fields with their weights. + use std::collections::HashMap; use serde::{Deserialize, Serialize}; @@ -10,22 +12,29 @@ pub struct FieldidsWeightsMap { } impl FieldidsWeightsMap { + /// Insert a field id -> weigth into the map. + /// If the map did not have this key present, `None` is returned. + /// If the map did have this key present, the value is updated, and the old value is returned. pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option { self.map.insert(fid, weight) } + /// Removes a field id from the map, returning the associated weight previously in the map. pub fn remove(&mut self, fid: FieldId) -> Option { self.map.remove(&fid) } + /// Returns weight corresponding to the key. pub fn weight(&self, fid: FieldId) -> Option { self.map.get(&fid).copied() } + /// Returns highest weight contained in the map if any. 
pub fn max_weight(&self) -> Option { self.map.values().copied().max() } + /// Return an iterator visiting all field ids in arbitrary order. pub fn ids(&self) -> impl Iterator + '_ { self.map.keys().copied() } diff --git a/milli/src/index.rs b/milli/src/index.rs index 7fe9da0ff..c565cdd5b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -26,9 +26,9 @@ use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, - FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap, - GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, - Weight, BEU16, BEU32, BEU64, + FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, + FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -446,22 +446,25 @@ impl Index { pub fn searchable_fields_and_weights<'a>( &self, rtxn: &'a RoTxn, - ) -> heed::Result, FieldId, Weight)>> { + ) -> Result, FieldId, Weight)>> { let fid_map = self.fields_ids_map(rtxn)?; let weight_map = self.fieldids_weights_map(rtxn)?; let searchable = self.searchable_fields(rtxn)?; - Ok(searchable + searchable .into_iter() - .map(|field| { - // the searchable attributes are a subset of the field id map - let fid = fid_map.id(&field).unwrap(); - // all the searchable fields have a weight - let weight = weight_map.weight(fid).unwrap(); + .map(|field| -> Result<_> { + let fid = fid_map.id(&field).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: field.to_string(), + process: "searchable_fields_and_weights", + })?; + let weight = weight_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; - (field, fid, weight) + Ok((field, fid, weight)) }) - 
.collect()) + .collect() } /* geo rtree */ diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index e10f2fbab..a4a08ea46 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -7,7 +7,7 @@ use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; -use crate::{FieldId, Result}; +use crate::{FieldId, InternalError, Result}; #[derive(Clone, PartialEq, Eq, Hash)] pub struct FidCondition { @@ -29,10 +29,9 @@ impl RankingRuleGraphTrait for FidGraph { let docids = if let Some(fid) = condition.fid { // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let mut docids = + let docids = compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; - docids &= universe; - docids + docids & universe } else { RoaringBitmap::new() }; @@ -75,7 +74,9 @@ impl RankingRuleGraphTrait for FidGraph { let mut edges = vec![]; for fid in all_fields.iter().copied() { - let weight = weights_map.weight(fid).unwrap(); + let weight = weights_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; edges.push(( weight as u32 * term.term_ids.len() as u32, conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 2e8ac157c..c66148813 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -475,33 +475,25 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { return Ok(false); } - // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (distinct_fields, filterable_fields). 
- let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - // Since we're updating the settings we can only add new fields at the end of the field id map - let mut new_fields_ids_map = old_fields_ids_map.clone(); - let names = fields - .iter() - // fields are deduplicated, only the first occurrence is taken into account - .unique() - .map(String::as_str) - .collect::>(); + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + // fields are deduplicated, only the first occurrence is taken into account + let names = fields.iter().unique().map(String::as_str).collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { // The fields ids map won't change the field id of already present elements thus only the // new fields will be inserted. - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, Some(&names), - &new_fields_ids_map, + &fields_ids_map, )?; - self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), From ad4d8502b3583f734f7508dea2e14656a8dea946 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 17:16:10 +0200 Subject: [PATCH 19/56] stops storing the whole fieldids weights map when no searchable are defined --- milli/src/fieldids_weights_map.rs | 9 ++++++- milli/src/index.rs | 36 ++++++++++++---------------- milli/src/update/settings.rs | 40 +++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 29 deletions(-) diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 72720a02a..5ca2a6146 100644 --- a/milli/src/fieldids_weights_map.rs +++ 
b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{FieldId, Weight}; +use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { @@ -19,6 +19,13 @@ impl FieldidsWeightsMap { self.map.insert(fid, weight) } + /// Create the map from the fields ids maps. + /// Should only be called in the case there are NO searchable attributes. + /// The weights and the fields ids will have the same values. + pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { + FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, fid)).collect() } + } + /// Removes a field id from the map, returning the associated weight previously in the map. pub fn remove(&mut self, fid: FieldId) -> Option { self.map.remove(&fid) diff --git a/milli/src/index.rs b/milli/src/index.rs index c565cdd5b..36f0b339e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -436,11 +436,20 @@ impl Index { /// Get the fieldids weights map which associates the field ids to their weights pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self - .main + self.main .remap_types::>() .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)? - .unwrap_or_default()) + .map(Ok) + .unwrap_or_else(|| { + Ok(FieldidsWeightsMap::from_field_id_map_without_searchable( + &self.fields_ids_map(rtxn)?, + )) + }) + } + + /// Delete the fieldsids weights map + pub fn delete_fieldids_weights_map(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY) } pub fn searchable_fields_and_weights<'a>( @@ -629,29 +638,13 @@ impl Index { pub(crate) fn put_all_searchable_fields_from_fields_ids_map( &self, wtxn: &mut RwTxn, - user_fields: Option<&[&str]>, + user_fields: &[&str], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { - // Special case if there is no user defined fields. 
- // Then the whole field id map is marked as searchable. - if user_fields.is_none() { - let mut weights = self.fieldids_weights_map(wtxn)?; - let mut searchable = Vec::new(); - for (weight, (fid, name)) in fields_ids_map.iter().enumerate() { - searchable.push(name); - weights.insert(fid, weight as u16); - } - self.put_searchable_fields(wtxn, &searchable)?; - self.put_fieldids_weights_map(wtxn, &weights)?; - return Ok(()); - } - - let user_fields = user_fields.unwrap(); - // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; - let mut weights = self.fieldids_weights_map(wtxn)?; + let mut weights = FieldidsWeightsMap::default(); // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. @@ -682,6 +675,7 @@ impl Index { pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { let did_delete_searchable = self.delete_searchable_fields(wtxn)?; let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?; + self.delete_fieldids_weights_map(wtxn)?; Ok(did_delete_searchable || did_delete_user_defined) } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c66148813..046644dc4 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -490,7 +490,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, - Some(&names), + &names, &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -1228,11 +1228,13 @@ impl InnerIndexSettings { .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); // in case new fields were introduced we're going to recreate the searchable fields. 
- index.put_all_searchable_fields_from_fields_ids_map( - wtxn, - searchable_fields.as_deref(), - &self.fields_ids_map, - )?; + if let Some(searchable_fields) = searchable_fields { + index.put_all_searchable_fields_from_fields_ids_map( + wtxn, + &searchable_fields, + &self.fields_ids_map, + )?; + } let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; self.searchable_fields_ids = searchable_fields_ids; @@ -1513,7 +1515,7 @@ mod tests { use crate::error::Error; use crate::index::tests::TempIndex; use crate::update::ClearDocuments; - use crate::{Criterion, Filter, SearchResult}; + use crate::{db_snap, Criterion, Filter, SearchResult}; #[test] fn set_and_reset_searchable_fields() { @@ -1542,6 +1544,17 @@ mod tests { wtxn.commit().unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + "###); + // Check that the searchable field is correctly set to "name" only. let rtxn = index.read_txn().unwrap(); // When we search for something that is not in @@ -1565,6 +1578,19 @@ mod tests { }) .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + 2 2 | + "###); + // Check that the searchable field have been reset and documents are found now. 
let rtxn = index.read_txn().unwrap(); let fid_map = index.fields_ids_map(&rtxn).unwrap(); From 5542f1d9f11c2c6d7ad691af11ed1b5177a13168 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 18:00:39 +0200 Subject: [PATCH 20/56] get back to what we were doingb efore in the DB cache and with the restricted field id --- milli/src/search/new/db_cache.rs | 140 ++++++++++++++++++++----------- milli/src/search/new/mod.rs | 19 ++--- 2 files changed, 99 insertions(+), 60 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 4985f55e9..4fa0765e0 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -159,36 +159,58 @@ impl<'ctx> SearchContext<'ctx> { /// Retrieve or insert the given value in the `word_docids` database. fn get_db_word_docids(&mut self, word: Interned) -> Result> { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.word_docids, + self.index.word_docids.remap_data_type::(), + ), + } } fn get_db_exact_word_docids( &mut self, word: Interned, ) -> Result> { - let interned = 
self.word_interner.get(word).as_str(); - let keys: Vec<_> = - self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.exact_word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.exact_word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.exact_word_docids, + self.index.exact_word_docids.remap_data_type::(), + ), + } } pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { @@ -216,36 +238,58 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.word_prefix_docids, + 
self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_docids.remap_data_type::(), + ), + } } fn get_db_exact_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.exact_word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.exact_word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.exact_word_prefix_docids, + self.index.exact_word_prefix_docids.remap_data_type::(), + ), + } } pub fn get_db_word_pair_proximity_docids( @@ -421,8 +465,8 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the list of searchable, return None. - if !self.searchable_fids.contains(&fid) { + // if the requested fid isn't in the restricted list, return None. 
+ if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { return Ok(None); } @@ -440,8 +484,8 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the searchable list, return None. - if !self.searchable_fids.contains(&fid) { + // if the requested fid isn't in the restricted list, return None. + if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { return Ok(None); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index b7514cbb5..2cea96fce 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -66,7 +66,7 @@ pub struct SearchContext<'ctx> { pub phrase_interner: DedupInterner, pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, - pub searchable_fids: SearchableFids, + pub restricted_fids: Option, } impl<'ctx> SearchContext<'ctx> { @@ -92,7 +92,7 @@ impl<'ctx> SearchContext<'ctx> { phrase_interner: <_>::default(), term_interner: <_>::default(), phrase_docids: <_>::default(), - searchable_fids: SearchableFids { tolerant, exact }, + restricted_fids: None, }) } @@ -103,7 +103,7 @@ impl<'ctx> SearchContext<'ctx> { let mut wildcard = false; - let mut restricted_fids = SearchableFids::default(); + let mut restricted_fids = RestrictedFids::default(); for field_name in attributes_to_search_on { if field_name == "*" { wildcard = true; @@ -141,14 +141,9 @@ impl<'ctx> SearchContext<'ctx> { } if wildcard { - let (exact, tolerant) = searchable_names - .iter() - .map(|(_name, fid, weight)| (*fid, *weight)) - .partition(|(fid, _weight)| exact_attributes_ids.contains(fid)); - - self.searchable_fids = SearchableFids { tolerant, exact }; + self.restricted_fids = None; } else { - self.searchable_fids = restricted_fids; + self.restricted_fids = Some(restricted_fids); } Ok(()) @@ -171,12 +166,12 @@ impl Word { } #[derive(Debug, Clone, Default)] -pub struct SearchableFids { +pub struct RestrictedFids { pub tolerant: 
Vec<(FieldId, Weight)>, pub exact: Vec<(FieldId, Weight)>, } -impl SearchableFids { +impl RestrictedFids { pub fn contains(&self, fid: &FieldId) -> bool { self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid) } From c78a2fa4f5dfdf9dc487d32ce7df6a52a2b02c64 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 18:04:42 +0200 Subject: [PATCH 21/56] rename method and variable around the attributes to search on feature --- milli/src/search/mod.rs | 2 +- milli/src/search/new/mod.rs | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7427db3a1..ca0eda49e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -158,7 +158,7 @@ impl<'a> Search<'a> { let mut ctx = SearchContext::new(self.index, self.rtxn)?; if let Some(searchable_attributes) = self.searchable_attributes { - ctx.searchable_attributes(searchable_attributes)?; + ctx.attributes_to_search_on(searchable_attributes)?; } let universe = filtered_universe(&ctx, &self.filter)?; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 2cea96fce..5e4c2f829 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -96,9 +96,12 @@ impl<'ctx> SearchContext<'ctx> { }) } - pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> { + pub fn attributes_to_search_on( + &mut self, + attributes_to_search_on: &'ctx [String], + ) -> Result<()> { let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?; - let searchable_names = self.index.searchable_fields_and_weights(self.txn)?; + let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; let mut wildcard = false; @@ -110,7 +113,8 @@ impl<'ctx> SearchContext<'ctx> { // we cannot early exit as we want to returns error in case of unknown fields 
continue; } - let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name); + let searchable_weight = + searchable_fields_weights.iter().find(|(name, _, _)| name == field_name); let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable Some((_name, fid, weight)) => (*fid, *weight), @@ -120,7 +124,7 @@ impl<'ctx> SearchContext<'ctx> { None => { let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( self.txn, - searchable_names.iter().map(|(name, _, _)| name), + searchable_fields_weights.iter().map(|(name, _, _)| name), )?; let field = field_name.to_string(); From f2d0a59f1da3a83875e57a38fb5c45e0af993b3f Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 01:06:33 +0200 Subject: [PATCH 22/56] when no searchable attributes are defined, makes all the weight equals to zero --- meilisearch/tests/search/hybrid.rs | 8 ++++---- meilisearch/tests/search/mod.rs | 2 +- meilisearch/tests/search/restrict_searchable.rs | 4 ++-- milli/src/fieldids_weights_map.rs | 4 ++-- milli/src/index.rs | 6 +++--- milli/src/update/settings.rs | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 68ae4c0aa..67f7909b9 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -85,8 +85,8 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); - snapshot!(response["semanticHitCount"], @"1"); + snapshot!(response["hits"], 
@r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index .search_post( @@ -331,7 +331,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -374,6 +374,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not 
part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b4350f686..f601e2b03 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -921,7 +921,7 @@ async fn test_score_details() { "order": 3, "attributeRankingOrderScore": 1.0, "queryWordDistanceScore": 0.8095238095238095, - "score": 0.9727891156462584 + "score": 0.8095238095238095 }, "exactness": { "order": 4, diff --git a/meilisearch/tests/search/restrict_searchable.rs b/meilisearch/tests/search/restrict_searchable.rs index 7bbdca38f..f52efa1f4 100644 --- a/meilisearch/tests/search/restrict_searchable.rs +++ b/meilisearch/tests/search/restrict_searchable.rs @@ -285,10 +285,10 @@ async fn attributes_ranking_rule_order() { @r###" [ { - "id": "2" + "id": "1" }, { - "id": "1" + "id": "2" } ] "### diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 5ca2a6146..a737632a4 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -21,9 +21,9 @@ impl FieldidsWeightsMap { /// Create the map from the fields ids maps. /// Should only be called in the case there are NO searchable attributes. - /// The weights and the fields ids will have the same values. + /// All the fields will be inserted in the order of the fields ids map with a weight of 0. pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, fid)).collect() } + FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } } /// Removes a field id from the map, returning the associated weight previously in the map. 
diff --git a/milli/src/index.rs b/milli/src/index.rs index 36f0b339e..42b9cb111 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2492,7 +2492,7 @@ pub(crate) mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 1 0 | "###); index.delete_documents(Default::default()); @@ -2512,7 +2512,7 @@ pub(crate) mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 1 0 | "###); index @@ -2537,7 +2537,7 @@ pub(crate) mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 1 0 | "###); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 046644dc4..0599bb9d8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1587,8 +1587,8 @@ mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | - 2 2 | + 1 0 | + 2 0 | "###); // Check that the searchable field have been reset and documents are found now. 
From 673b6e1dc0f9ad6d688c5f8da7295d1f4e041c5f Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 11:28:14 +0200 Subject: [PATCH 23/56] fix a flaky test --- meilisearch/tests/snapshot/mod.rs | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/snapshot/mod.rs b/meilisearch/tests/snapshot/mod.rs index 1312aa9ca..67e80f45b 100644 --- a/meilisearch/tests/snapshot/mod.rs +++ b/meilisearch/tests/snapshot/mod.rs @@ -1,6 +1,5 @@ use std::time::Duration; -use actix_rt::time::sleep; use meili_snap::{json_string, snapshot}; use meilisearch::option::ScheduleSnapshot; use meilisearch::Opt; @@ -53,11 +52,29 @@ async fn perform_snapshot() { index.load_test_set().await; - server.index("test1").create(Some("prim")).await; + let (task, code) = server.index("test1").create(Some("prim")).await; + meili_snap::snapshot!(code, @"202 Accepted"); - index.wait_task(2).await; + index.wait_task(task.uid()).await; - sleep(Duration::from_secs(2)).await; + // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point + + println!("waited for the next task to finish"); + let now = std::time::Instant::now(); + let next_task = task.uid() + 1; + loop { + let (value, code) = index.get_task(next_task).await; + dbg!(&value); + if code != 404 && value["status"].as_str() == Some("succeeded") { + break; + } + + if now.elapsed() > Duration::from_secs(30) { + panic!("The snapshot didn't schedule in 30s even though it was supposed to be scheduled every 2s: {}", + serde_json::to_string_pretty(&value).unwrap() + ); + } + } let temp = tempfile::tempdir().unwrap(); From 8e6ffbfc6f55580784d9322af0453b874fe5cb0e Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 28 Mar 2024 18:22:31 +0100 Subject: [PATCH 24/56] stream documents --- Cargo.lock | 12 +-- meilisearch/Cargo.toml | 1 + meilisearch/src/routes/indexes/documents.rs | 114 ++++++++++++++------ meilisearch/src/routes/mod.rs | 28 +++-- 4 files changed, 107 
insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 937fce64a..5d87830a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3348,6 +3348,7 @@ dependencies = [ "rayon", "regex", "reqwest", + "roaring", "rustls 0.21.12", "rustls-pemfile", "segment", @@ -4416,12 +4417,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "retain_mut" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" - [[package]] name = "ring" version = "0.17.8" @@ -4439,13 +4434,12 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +checksum = "a1c77081a55300e016cb86f2864415b7518741879db925b8d488a0ee0d2da6bf" dependencies = [ "bytemuck", "byteorder", - "retain_mut", "serde", ] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index ed62c5f48..612c6731b 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -108,6 +108,7 @@ tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.9" build-info = { version = "1.7.0", path = "../build-info" } +roaring = "0.10.3" [dev-dependencies] actix-rt = "2.9.0" diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 43fab1dae..78af7a098 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,12 +1,14 @@ -use std::io::ErrorKind; +use std::io::{ErrorKind, Write}; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; use actix_web::{web, HttpMessage, HttpRequest, HttpResponse}; use bstr::ByteSlice as _; +use bytes::Bytes; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::Deserr; use futures::StreamExt; 
+use futures_util::Stream; use index_scheduler::{IndexScheduler, TaskId}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; @@ -22,7 +24,9 @@ use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; -use serde::Deserialize; +use roaring::RoaringBitmap; +use serde::ser::SerializeSeq; +use serde::{Deserialize, Serialize}; use serde_json::Value; use tempfile::tempfile; use tokio::fs::File; @@ -230,6 +234,34 @@ pub async fn get_documents( documents_by_query(&index_scheduler, index_uid, query) } +pub struct Writer2Streamer { + sender: tokio::sync::mpsc::Sender>, +} + +impl Write for Writer2Streamer { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.sender.blocking_send(Ok(buf.to_vec().into())).map_err(std::io::Error::other)?; + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +pub fn stream( + data: impl Serialize + Send + Sync + 'static, +) -> impl Stream> { + let (sender, receiver) = tokio::sync::mpsc::channel::>(1); + + tokio::task::spawn_blocking(move || { + serde_json::to_writer(std::io::BufWriter::new(Writer2Streamer { sender }), &data) + }); + futures_util::stream::unfold(receiver, |mut receiver| async { + receiver.recv().await.map(|value| (value, receiver)) + }) +} + fn documents_by_query( index_scheduler: &IndexScheduler, index_uid: web::Path, @@ -239,12 +271,13 @@ fn documents_by_query( let BrowseQuery { offset, limit, fields, filter } = query; let index = index_scheduler.index(&index_uid)?; - let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?; + let documents = retrieve_documents(index, offset, limit, filter, fields)?; - let ret = PaginationView::new(offset, limit, total as usize, documents); + let ret = PaginationView::new(offset, limit, documents.total_documents as usize, documents); debug!(returns = ?ret, "Get 
documents"); - Ok(HttpResponse::Ok().json(ret)) + + Ok(HttpResponse::Ok().streaming(stream(ret))) } #[derive(Deserialize, Debug, Deserr)] @@ -590,13 +623,46 @@ fn some_documents<'a, 't: 'a>( })) } -fn retrieve_documents>( - index: &Index, +pub struct DocumentsStreamer { + attributes_to_retrieve: Option>, + documents: RoaringBitmap, + index: Index, + pub total_documents: u64, +} + +impl Serialize for DocumentsStreamer { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let rtxn = self.index.read_txn().unwrap(); + + let mut seq = serializer.serialize_seq(Some(self.documents.len() as usize)).unwrap(); + + let documents = some_documents(&self.index, &rtxn, self.documents.iter()).unwrap(); + for document in documents { + let document = document.unwrap(); + let document = match self.attributes_to_retrieve { + Some(ref attributes_to_retrieve) => permissive_json_pointer::select_values( + &document, + attributes_to_retrieve.iter().map(|s| s.as_ref()), + ), + None => document, + }; + + seq.serialize_element(&document)?; + } + seq.end() + } +} + +fn retrieve_documents( + index: Index, offset: usize, limit: usize, filter: Option, - attributes_to_retrieve: Option>, -) -> Result<(u64, Vec), ResponseError> { + attributes_to_retrieve: Option>, +) -> Result { let rtxn = index.read_txn()?; let filter = &filter; let filter = if let Some(filter) = filter { @@ -607,7 +673,7 @@ fn retrieve_documents>( }; let candidates = if let Some(filter) = filter { - filter.evaluate(&rtxn, index).map_err(|err| match err { + filter.evaluate(&rtxn, &index).map_err(|err| match err { milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter) } @@ -616,28 +682,14 @@ fn retrieve_documents>( } else { index.documents_ids(&rtxn)? 
}; + drop(rtxn); - let (it, number_of_documents) = { - let number_of_documents = candidates.len(); - ( - some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, - number_of_documents, - ) - }; - - let documents: Result, ResponseError> = it - .map(|document| { - Ok(match &attributes_to_retrieve { - Some(attributes_to_retrieve) => permissive_json_pointer::select_values( - &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()), - ), - None => document?, - }) - }) - .collect(); - - Ok((number_of_documents, documents?)) + Ok(DocumentsStreamer { + total_documents: candidates.len(), + attributes_to_retrieve, + documents: candidates.into_iter().skip(offset).take(limit).collect(), + index, + }) } fn retrieve_document>( diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index c25aeee70..a7e84d19c 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -1,4 +1,5 @@ use std::collections::BTreeMap; +use std::fmt; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; @@ -124,20 +125,31 @@ pub struct Pagination { pub limit: usize, } -#[derive(Debug, Clone, Serialize)] -pub struct PaginationView { - pub results: Vec, +#[derive(Clone, Serialize)] +pub struct PaginationView { + pub results: T, pub offset: usize, pub limit: usize, pub total: usize, } +impl fmt::Debug for PaginationView { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PaginationView") + .field("offset", &self.offset) + .field("limit", &self.limit) + .field("total", &self.total) + .field("results", &"[...]") + .finish() + } +} + impl Pagination { /// Given the full data to paginate, returns the selected section. 
pub fn auto_paginate_sized( self, content: impl IntoIterator + ExactSizeIterator, - ) -> PaginationView + ) -> PaginationView> where T: Serialize, { @@ -151,7 +163,7 @@ impl Pagination { self, total: usize, content: impl IntoIterator, - ) -> PaginationView + ) -> PaginationView> where T: Serialize, { @@ -161,7 +173,7 @@ impl Pagination { /// Given the data already paginated + the total number of elements, it stores /// everything in a [PaginationResult]. - pub fn format_with(self, total: usize, results: Vec) -> PaginationView + pub fn format_with(self, total: usize, results: Vec) -> PaginationView> where T: Serialize, { @@ -169,8 +181,8 @@ impl Pagination { } } -impl PaginationView { - pub fn new(offset: usize, limit: usize, total: usize, results: Vec) -> Self { +impl PaginationView { + pub fn new(offset: usize, limit: usize, total: usize, results: T) -> Self { Self { offset, limit, results, total } } } From c85d1752dd3937ffdfc8f86f16108bfa9388aaac Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 18 Apr 2024 15:51:46 +0200 Subject: [PATCH 25/56] keep the same rtxn to compute the filters on the documents and to stream the documents later on --- meilisearch/src/routes/indexes/documents.rs | 28 +++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 78af7a098..9d34fcdfe 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,4 +1,5 @@ use std::io::{ErrorKind, Write}; +use std::pin::Pin; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -250,7 +251,7 @@ impl Write for Writer2Streamer { } pub fn stream( - data: impl Serialize + Send + Sync + 'static, + data: impl Serialize + Send + 'static, ) -> impl Stream> { let (sender, receiver) = tokio::sync::mpsc::channel::>(1); @@ -626,20 +627,31 @@ fn some_documents<'a, 't: 'a>( pub struct DocumentsStreamer { 
attributes_to_retrieve: Option>, documents: RoaringBitmap, - index: Index, + // safety: The `rtxn` contains a reference to the index thus: + // - The `rtxn` MUST BE dropped before the index. + // - The index MUST BE `Pin`ned in RAM and never moved. + rtxn: Option>, + index: Pin>, pub total_documents: u64, } +impl Drop for DocumentsStreamer { + fn drop(&mut self) { + // safety: we drop the rtxn before the index + self.rtxn = None; + } +} + impl Serialize for DocumentsStreamer { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - let rtxn = self.index.read_txn().unwrap(); + let rtxn = self.rtxn.as_ref().unwrap(); let mut seq = serializer.serialize_seq(Some(self.documents.len() as usize)).unwrap(); - let documents = some_documents(&self.index, &rtxn, self.documents.iter()).unwrap(); + let documents = some_documents(&self.index, rtxn, self.documents.iter()).unwrap(); for document in documents { let document = document.unwrap(); let document = match self.attributes_to_retrieve { @@ -663,7 +675,10 @@ fn retrieve_documents( filter: Option, attributes_to_retrieve: Option>, ) -> Result { + // safety: The index MUST NOT move while we hold the `rtxn` on it + let index = Box::pin(index); let rtxn = index.read_txn()?; + let filter = &filter; let filter = if let Some(filter) = filter { parse_filter(filter) @@ -682,12 +697,15 @@ fn retrieve_documents( } else { index.documents_ids(&rtxn)? }; - drop(rtxn); Ok(DocumentsStreamer { total_documents: candidates.len(), attributes_to_retrieve, documents: candidates.into_iter().skip(offset).take(limit).collect(), + // safety: It is safe to make the lifetime in the Rtxn static because it points to the index right below. + // The index is `Pin`ned on the RAM and won't move even if the structure is moved. + // The `rtxn` is held in an `Option`, so we're able to drop it before dropping the index. 
+ rtxn: Some(unsafe { std::mem::transmute(rtxn) }), index, }) } From 897d25780ef7a442d4bc1ac2599eea49dcf75448 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 16:10:55 +0200 Subject: [PATCH 26/56] update milli to latest version --- Cargo.lock | 26 +++++-------------- index-scheduler/src/batch.rs | 10 ++++--- index-scheduler/src/lib.rs | 14 +++++----- meilisearch-auth/src/store.rs | 2 +- meilisearch-types/src/error.rs | 1 - meilitool/src/main.rs | 8 ++---- milli/Cargo.toml | 10 +++++-- milli/fuzz/.gitignore | 3 +++ milli/src/error.rs | 3 --- milli/src/index.rs | 7 ++++- milli/src/update/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 2 +- .../src/update/index_documents/typed_chunk.rs | 3 +-- 13 files changed, 44 insertions(+), 47 deletions(-) create mode 100644 milli/fuzz/.gitignore diff --git a/Cargo.lock b/Cargo.lock index 5d87830a5..7df0e7e86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,9 +378,7 @@ dependencies = [ [[package]] name = "arroy" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efddeb1e7c32a551cc07ef4c3e181e3cd5478fdaf4f0bd799983171c1f6efe57" +version = "0.3.0" dependencies = [ "bytemuck", "byteorder", @@ -1536,9 +1534,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "doxygen-rs" -version = "0.2.2" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff670ea0c9bbb8414e7efa6e23ebde2b8f520a7eef78273a3918cf1903e7505" +checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9" dependencies = [ "phf", ] @@ -2262,12 +2260,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.0-alpha.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934" +version = "0.20.0" dependencies = [ "bitflags 2.5.0", - "bytemuck", 
"byteorder", "heed-traits", "heed-types", @@ -2281,15 +2276,11 @@ dependencies = [ [[package]] name = "heed-traits" -version = "0.20.0-alpha.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab0b7d9cde969ad36dde692e487dc89d97f7168bf6a7bd3b894ad4bf7278298" +version = "0.20.0" [[package]] name = "heed-types" -version = "0.20.0-alpha.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0cb3567a7363f28b597bf6e9897b9466397951dd0e52df2c8196dd8a71af44a" +version = "0.20.0" dependencies = [ "bincode", "byteorder", @@ -3189,14 +3180,11 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629c123f5321b48fa4f8f4d3b868165b748d9ba79c7103fb58e3a94f736bcedd" +version = "0.2.0" dependencies = [ "cc", "doxygen-rs", "libc", - "pkg-config", ] [[package]] diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index bc9823a01..582497c15 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -785,10 +785,12 @@ impl IndexScheduler { let dst = temp_snapshot_dir.path().join("auth"); fs::create_dir_all(&dst)?; // TODO We can't use the open_auth_store_env function here but we should - let auth = milli::heed::EnvOpenOptions::new() - .map_size(1024 * 1024 * 1024) // 1 GiB - .max_dbs(2) - .open(&self.auth_path)?; + let auth = unsafe { + milli::heed::EnvOpenOptions::new() + .map_size(1024 * 1024 * 1024) // 1 GiB + .max_dbs(2) + .open(&self.auth_path) + }?; auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; // 5. 
Copy and tarball the flat snapshot diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5704f5354..dd2b296f6 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -453,10 +453,12 @@ impl IndexScheduler { ) }; - let env = heed::EnvOpenOptions::new() - .max_dbs(11) - .map_size(budget.task_db_size) - .open(options.tasks_path)?; + let env = unsafe { + heed::EnvOpenOptions::new() + .max_dbs(11) + .map_size(budget.task_db_size) + .open(options.tasks_path) + }?; let features = features::FeatureData::new(&env, options.instance_features)?; @@ -585,9 +587,9 @@ impl IndexScheduler { } fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { - if let Ok(env) = + if let Ok(env) = unsafe { heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) - { + } { env.prepare_for_closing().wait(); true } else { diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs index 1eebd3fe9..ef992e836 100644 --- a/meilisearch-auth/src/store.rs +++ b/meilisearch-auth/src/store.rs @@ -49,7 +49,7 @@ pub fn open_auth_store_env(path: &Path) -> milli::heed::Result let mut options = EnvOpenOptions::new(); options.map_size(AUTH_STORE_SIZE); // 1GB options.max_dbs(2); - options.open(path) + unsafe { options.open(path) } } impl HeedAuthStore { diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index eea012331..158dfae92 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -423,7 +423,6 @@ impl ErrorCode for HeedError { HeedError::Mdb(_) | HeedError::Encoding(_) | HeedError::Decoding(_) - | HeedError::InvalidDatabaseTyping | HeedError::DatabaseClosing | HeedError::BadOpenOptions { .. } => Code::Internal, } diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index bfcbfdd6d..06c4890a5 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -80,9 +80,7 @@ fn main() -> anyhow::Result<()> { /// Clears the task queue located at `db_path`. 
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { let path = db_path.join("tasks"); - let env = EnvOpenOptions::new() - .max_dbs(100) - .open(&path) + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) } .with_context(|| format!("While trying to open {:?}", path.display()))?; eprintln!("Deleting tasks from the database..."); @@ -193,9 +191,7 @@ fn export_a_dump( FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?; let index_scheduler_path = db_path.join("tasks"); - let env = EnvOpenOptions::new() - .max_dbs(100) - .open(&index_scheduler_path) + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; eprintln!("Dumping the keys..."); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7d903178b..ab63a1fa7 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,7 +30,12 @@ grenad = { version = "0.4.6", default-features = false, features = [ "rayon", "tempfile", ] } -heed = { version = "0.20.0-alpha.9", default-features = false, features = [ +# heed = { version = "0.20.0", default-features = false, features = [ +# "serde-json", +# "serde-bincode", +# "read-txn-no-tls", +# ] } +heed = { path = "../../heed/heed", default-features = false, features = [ "serde-json", "serde-bincode", "read-txn-no-tls", @@ -82,7 +87,8 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -arroy = "0.2.0" +# arroy = "0.2.0" +arroy = { path = "../../arroy" } rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore new file mode 100644 index 000000000..a0925114d --- /dev/null +++ b/milli/fuzz/.gitignore @@ -0,0 +1,3 @@ +target +corpus +artifacts diff --git a/milli/src/error.rs b/milli/src/error.rs index 009781fcf..6db0dcac1 100644 --- a/milli/src/error.rs 
+++ b/milli/src/error.rs @@ -48,8 +48,6 @@ pub enum InternalError { GrenadInvalidFormatVersion, #[error("Invalid merge while processing {process}")] IndexingMergingKeys { process: &'static str }, - #[error("{}", HeedError::InvalidDatabaseTyping)] - InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), #[error(transparent)] @@ -429,7 +427,6 @@ impl From for Error { // TODO use the encoding HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), - HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), HeedError::DatabaseClosing => InternalError(DatabaseClosing), HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions), } diff --git a/milli/src/index.rs b/milli/src/index.rs index 42b9cb111..739a7f202 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -184,7 +184,7 @@ impl Index { options.max_dbs(25); - let env = options.open(path)?; + let env = unsafe { options.open(path) }?; let mut wtxn = env.write_txn()?; let main = env.database_options().name(MAIN).create(&mut wtxn)?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; @@ -294,6 +294,11 @@ impl Index { self.env.read_txn() } + /// Create a static read transaction to be able to read the index without keeping a reference to it. + pub fn static_read_txn(&self) -> heed::Result> { + self.env.clone().static_read_txn() + } + /// Returns the canonicalized path where the heed `Env` of this `Index` lives. 
pub fn path(&self) -> &Path { self.env.path() diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 0af64c4c5..42994551f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -379,7 +379,7 @@ pub(crate) mod test_helpers { let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); - let env = options.open(tempdir.path()).unwrap(); + let env = unsafe { options.open(tempdir.path()) }.unwrap(); let mut wtxn = env.write_txn().unwrap(); let content = env.create_database(&mut wtxn, None).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 936ce1efc..4d2fac7cb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -556,7 +556,7 @@ where let writer_index = (embedder_index as u16) << 8; for k in 0..=u8::MAX { let writer = - arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?; + arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension); if writer.is_empty(wtxn)? 
{ break; } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 6aad290e5..e0de2d5a1 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -661,7 +661,7 @@ pub(crate) fn write_typed_chunk_into_index( )?; let writer_index = (embedder_index as u16) << 8; // FIXME: allow customizing distance - let writers: std::result::Result, _> = (0..=u8::MAX) + let writers: Vec<_> = (0..=u8::MAX) .map(|k| { arroy::Writer::new( index.vector_arroy, @@ -670,7 +670,6 @@ pub(crate) fn write_typed_chunk_into_index( ) }) .collect(); - let writers = writers?; // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); From 273c6e8c5c28573af67b44d7d1f13a043a7b7915 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 16:11:08 +0200 Subject: [PATCH 27/56] uses the latest version of heed to get rid of unsafe code --- Cargo.lock | 14 +++++++++-- meilisearch/src/routes/indexes/documents.rs | 28 ++++----------------- milli/Cargo.toml | 10 ++------ 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7df0e7e86..d9e96b029 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,7 +378,9 @@ dependencies = [ [[package]] name = "arroy" -version = "0.3.0" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9" dependencies = [ "bytemuck", "byteorder", @@ -2260,7 +2262,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.0" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd" dependencies = [ "bitflags 2.5.0", "byteorder", @@ -2277,10 +2281,14 @@ dependencies = [ [[package]] name = "heed-traits" version = "0.20.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" [[package]] name = "heed-types" version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cb0d6ba3700c9a57e83c013693e3eddb68a6d9b6781cacafc62a0d992e8ddb3" dependencies = [ "bincode", "byteorder", @@ -3181,6 +3189,8 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a" dependencies = [ "cc", "doxygen-rs", diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 9d34fcdfe..7c9b4b761 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,5 +1,4 @@ use std::io::{ErrorKind, Write}; -use std::pin::Pin; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -627,31 +626,19 @@ fn some_documents<'a, 't: 'a>( pub struct DocumentsStreamer { attributes_to_retrieve: Option>, documents: RoaringBitmap, - // safety: The `rtxn` contains a reference to the index thus: - // - The `rtxn` MUST BE dropped before the index. - // - The index MUST BE `Pin`ned in RAM and never moved. 
- rtxn: Option>, - index: Pin>, + rtxn: RoTxn<'static>, + index: Index, pub total_documents: u64, } -impl Drop for DocumentsStreamer { - fn drop(&mut self) { - // safety: we drop the rtxn before the index - self.rtxn = None; - } -} - impl Serialize for DocumentsStreamer { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - let rtxn = self.rtxn.as_ref().unwrap(); - let mut seq = serializer.serialize_seq(Some(self.documents.len() as usize)).unwrap(); - let documents = some_documents(&self.index, rtxn, self.documents.iter()).unwrap(); + let documents = some_documents(&self.index, &self.rtxn, self.documents.iter()).unwrap(); for document in documents { let document = document.unwrap(); let document = match self.attributes_to_retrieve { @@ -675,9 +662,7 @@ fn retrieve_documents( filter: Option, attributes_to_retrieve: Option>, ) -> Result { - // safety: The index MUST NOT move while we hold the `rtxn` on it - let index = Box::pin(index); - let rtxn = index.read_txn()?; + let rtxn = index.static_read_txn()?; let filter = &filter; let filter = if let Some(filter) = filter { @@ -702,10 +687,7 @@ fn retrieve_documents( total_documents: candidates.len(), attributes_to_retrieve, documents: candidates.into_iter().skip(offset).take(limit).collect(), - // safety: It is safe to make the lifetime in the Rtxn static because it points to the index right below. - // The index is `Pin`ned on the RAM and won't move even if the structure is moved. - // The `rtxn` is held in an `Option`, so we're able to drop it before dropping the index. 
- rtxn: Some(unsafe { std::mem::transmute(rtxn) }), + rtxn, index, }) } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ab63a1fa7..c5dddd0fd 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,12 +30,7 @@ grenad = { version = "0.4.6", default-features = false, features = [ "rayon", "tempfile", ] } -# heed = { version = "0.20.0", default-features = false, features = [ -# "serde-json", -# "serde-bincode", -# "read-txn-no-tls", -# ] } -heed = { path = "../../heed/heed", default-features = false, features = [ +heed = { version = "0.20.1", default-features = false, features = [ "serde-json", "serde-bincode", "read-txn-no-tls", @@ -87,8 +82,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -# arroy = "0.2.0" -arroy = { path = "../../arroy" } +arroy = "0.3.1" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } From 98c811247e1d8c92523f8f933383063f6e009d5a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:22:16 +0200 Subject: [PATCH 28/56] Add parsed vectors module --- milli/src/vector/mod.rs | 1 + milli/src/vector/parsed_vectors.rs | 149 +++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 milli/src/vector/parsed_vectors.rs diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 306c1c1e9..d3d05a1c1 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -13,6 +13,7 @@ pub mod error; pub mod hf; pub mod manual; pub mod openai; +pub mod parsed_vectors; pub mod settings; pub mod ollama; diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs new file mode 100644 index 000000000..bf4b9ea83 --- /dev/null +++ b/milli/src/vector/parsed_vectors.rs @@ -0,0 +1,149 @@ +use std::collections::BTreeMap; + +use obkv::KvReader; +use serde_json::{from_slice, Value}; + +use super::Embedding; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; +use 
crate::{FieldId, InternalError, UserError}; + +pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; + +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(untagged)] +pub enum Vectors { + ImplicitlyUserProvided(VectorOrArrayOfVectors), + Explicit(ExplicitVectors), +} + +impl Vectors { + pub fn into_array_of_vectors(self) -> Vec { + match self { + Vectors::ImplicitlyUserProvided(embeddings) + | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { + embeddings.into_array_of_vectors().unwrap_or_default() + } + } + } +} + +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +pub struct ExplicitVectors { + pub embeddings: VectorOrArrayOfVectors, + pub user_provided: bool, +} + +pub struct ParsedVectorsDiff { + pub old: Option>, + pub new: Option>, +} + +impl ParsedVectorsDiff { + pub fn new( + documents_diff: KvReader<'_, FieldId>, + old_vectors_fid: Option, + new_vectors_fid: Option, + ) -> Result { + let old = match old_vectors_fid + .and_then(|vectors_fid| documents_diff.get(vectors_fid)) + .map(KvReaderDelAdd::new) + .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) + .transpose() + { + Ok(del) => del, + // ignore wrong shape for old version of documents, use an empty map in this case + Err(Error::InvalidMap(value)) => { + tracing::warn!(%value, "Previous version of the `_vectors` field had a wrong shape"); + Default::default() + } + Err(error) => { + return Err(error); + } + } + .flatten(); + let new = new_vectors_fid + .and_then(|vectors_fid| documents_diff.get(vectors_fid)) + .map(KvReaderDelAdd::new) + .map(|obkv| to_vector_map(obkv, DelAdd::Addition)) + .transpose()? 
+ .flatten(); + Ok(Self { old, new }) + } + + pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { + let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); + let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); + (old, new) + } +} + +pub struct ParsedVectors(pub BTreeMap); + +impl ParsedVectors { + pub fn from_bytes(value: &[u8]) -> Result { + let Ok(value) = from_slice(value) else { + let value = from_slice(value).map_err(Error::InternalSerdeJson)?; + return Err(Error::InvalidMap(value)); + }; + Ok(ParsedVectors(value)) + } + + pub fn retain_user_provided_vectors(&mut self) { + self.0.retain(|_k, v| match v { + Vectors::ImplicitlyUserProvided(_) => true, + Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => *user_provided, + }); + } +} + +pub enum Error { + InvalidMap(Value), + InternalSerdeJson(serde_json::Error), +} + +impl Error { + pub fn to_crate_error(self, document_id: String) -> crate::Error { + match self { + Error::InvalidMap(value) => { + crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value }) + } + Error::InternalSerdeJson(error) => { + crate::Error::InternalError(InternalError::SerdeJson(error)) + } + } + } +} + +fn to_vector_map( + obkv: KvReaderDelAdd, + side: DelAdd, +) -> Result>, Error> { + Ok(if let Some(value) = obkv.get(side) { + let ParsedVectors(parsed_vectors) = ParsedVectors::from_bytes(value)?; + Some(parsed_vectors) + } else { + None + }) +} + +/// Represents either a vector or an array of multiple vectors. +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(transparent)] +pub struct VectorOrArrayOfVectors { + #[serde(with = "either::serde_untagged_optional")] + inner: Option>>, +} + +impl VectorOrArrayOfVectors { + pub fn into_array_of_vectors(self) -> Option> { + match self.inner? 
{ + either::Either::Left(vector) => Some(vec![vector]), + either::Either::Right(vectors) => Some(vectors), + } + } + + pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { + Self { inner: Some(either::Either::Right(array_of_vec)) } + } +} From 261de888b71a3ba4bc891b09e30715e68bf8a812 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:38:28 +0200 Subject: [PATCH 29/56] Add function to get the embeddings of a document in an index --- milli/src/index.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 739a7f202..66cd6f3cc 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1606,6 +1606,44 @@ impl Index { pub(crate) fn delete_search_cutoff(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::SEARCH_CUTOFF) } + + pub fn embeddings( + &self, + rtxn: &RoTxn<'_>, + docid: DocumentId, + ) -> Result>> { + let mut res = BTreeMap::new(); + for row in self.embedder_category_id.iter(rtxn)? 
{ + let (embedder_name, embedder_id) = row?; + let embedder_id = (embedder_id as u16) << 8; + let mut embeddings = Vec::new(); + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader?.item_vector(rtxn, docid)?; + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + if !embeddings.is_empty() { + res.insert(embedder_name.to_owned(), embeddings); + } + } + Ok(res) + } } #[cfg(test)] From 52d9cb6e5af5dcfe23638354f3b124a0371b007d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:42:26 +0200 Subject: [PATCH 30/56] Refactor vector indexing - use the parsed_vectors module - only parse `_vectors` once per document, instead of once per embedder per document --- milli/src/error.rs | 2 +- milli/src/lib.rs | 29 -- .../extract/extract_vector_points.rs | 373 +++++++++--------- .../src/update/index_documents/extract/mod.rs | 46 +-- milli/src/vector/mod.rs | 4 + 5 files changed, 218 insertions(+), 236 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 6db0dcac1..e60252ec1 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -120,7 +120,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco #[error("The `_vectors.{subfield}` field in the document with id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")] InvalidVectorsType { document_id: Value, value: Value, subfield: String }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. 
Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] - InvalidVectorsMapType { document_id: Value, value: Value }, + InvalidVectorsMapType { document_id: String, value: Value }, #[error("{0}")] InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 881633b5c..f6b86f14a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -362,35 +362,6 @@ pub fn normalize_facet(original: &str) -> String { CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase() } -/// Represents either a vector or an array of multiple vectors. -#[derive(serde::Serialize, serde::Deserialize, Debug)] -#[serde(transparent)] -pub struct VectorOrArrayOfVectors { - #[serde(with = "either::serde_untagged_optional")] - inner: Option, Vec>>>, -} - -impl VectorOrArrayOfVectors { - pub fn into_array_of_vectors(self) -> Option>> { - match self.inner? { - either::Either::Left(vector) => Some(vec![vector]), - either::Either::Right(vectors) => Some(vectors), - } - } -} - -/// Normalize a vector by dividing the dimensions by the length of it. 
-pub fn normalize_vector(mut vector: Vec) -> Vec { - let squared: f32 = vector.iter().map(|x| x * x).sum(); - let length = squared.sqrt(); - if length <= f32::EPSILON { - vector - } else { - vector.iter_mut().for_each(|x| *x /= length); - vector - } -} - #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 322fa3725..8b78a8c55 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -10,16 +10,16 @@ use bytemuck::cast_slice; use grenad::Writer; use itertools::EitherOrBoth; use ordered_float::OrderedFloat; -use serde_json::{from_slice, Value}; +use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; -use crate::error::UserError; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::Embedder; -use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors}; +use crate::{DocumentId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -31,6 +31,10 @@ pub struct ExtractedVectorPoints { pub remove_vectors: grenad::Reader>, // docid -> prompt pub prompts: grenad::Reader>, + + // embedder + pub embedder_name: String, + pub embedder: Arc, } enum VectorStateDelta { @@ -65,6 +69,19 @@ impl VectorStateDelta { } } +struct EmbedderVectorExtractor { + embedder_name: String, + embedder: Arc, + prompt: Arc, + + // (docid, _index) -> KvWriterDelAdd -> Vector + manual_vectors_writer: Writer>, + // (docid) -> (prompt) + prompts_writer: Writer>, + // (docid) -> () + remove_vectors_writer: Writer>, +} + /// Extracts the embedding vector contained in each document under the `_vectors` field. /// /// Returns the generated grenad reader containing the docid as key associated to the Vec @@ -72,35 +89,55 @@ impl VectorStateDelta { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - settings_diff: &InnerIndexSettingsDiff, - prompt: &Prompt, - embedder_name: &str, -) -> Result { + settings_diff: Arc, +) -> Result> { puffin::profile_function!(); + let reindex_vectors = settings_diff.reindex_vectors(); + let old_fields_ids_map = &settings_diff.old.fields_ids_map; let new_fields_ids_map = &settings_diff.new.fields_ids_map; + // the vector field id may have changed + let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); + // filter the old vector fid if the settings has been changed forcing reindexing. 
+ let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors); - // (docid, _index) -> KvWriterDelAdd -> Vector - let mut manual_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); - // (docid) -> (prompt) - let mut prompts_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut extractors = Vec::new(); + for (embedder_name, (embedder, prompt)) in + settings_diff.new.embedding_configs.clone().into_iter() + { + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); - // (docid) -> () - let mut remove_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + manual_vectors_writer, + prompts_writer, + remove_vectors_writer, + }); + } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -114,152 +151,140 @@ pub fn extract_vector_points( key_buffer.clear(); key_buffer.extend_from_slice(docid_bytes); - // since we only needs the primary key when we throw an error we create this getter to + // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - // the 
vector field id may have changed - let old_vectors_fid = old_fields_ids_map.id("_vectors"); - // filter the old vector fid if the settings has been changed forcing reindexing. - let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors()); + let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; - let new_vectors_fid = new_fields_ids_map.id("_vectors"); - let vectors_field = { - let del = old_vectors_fid - .and_then(|vectors_fid| obkv.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id)) - .transpose()? - .flatten(); - let add = new_vectors_fid - .and_then(|vectors_fid| obkv.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id)) - .transpose()? - .flatten(); - (del, add) - }; + for EmbedderVectorExtractor { + embedder_name, + embedder: _, + prompt, + manual_vectors_writer, + prompts_writer, + remove_vectors_writer, + } in extractors.iter_mut() + { + let delta = match parsed_vectors.remove(embedder_name) { + (Some(old), Some(new)) => { + // no autogeneration + let del_vectors = old.into_array_of_vectors(); + let add_vectors = new.into_array_of_vectors(); - let (del_map, add_map) = vectors_field; - - let del_value = del_map.and_then(|mut map| map.remove(embedder_name)); - let add_value = add_map.and_then(|mut map| map.remove(embedder_name)); - - let delta = match (del_value, add_value) { - (Some(old), Some(new)) => { - // no autogeneration - let del_vectors = extract_vectors(old, document_id, embedder_name)?; - let add_vectors = extract_vectors(new, document_id, embedder_name)?; - - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::ManualDelta(del_vectors, add_vectors) - } - 
(Some(_old), None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) - } else { - VectorStateDelta::NowRemoved - } - } - (None, Some(new)) => { - // was possibly autogenerated, remove all vectors for that document - let add_vectors = extract_vectors(new, document_id, embedder_name)?; - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::WasGeneratedNowManual(add_vectors) - } - (None, None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - - if document_is_kept { - // Don't give up if the old prompt was failing - let old_prompt = Some(prompt) - // TODO: this filter works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. 
- .filter(|_| !settings_diff.reindex_vectors()) - .map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() - }); - let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); } - } else { - VectorStateDelta::NowRemoved - } - } - }; - // and we finally push the unique vectors into the writer - push_vectors_diff( - &mut remove_vectors_writer, - &mut prompts_writer, - &mut manual_vectors_writer, - &mut key_buffer, - delta, - settings_diff, - )?; + VectorStateDelta::ManualDelta(del_vectors, add_vectors) + } + (Some(_old), None) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // becomes autogenerated + VectorStateDelta::NowGenerated(prompt.render( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) + } else { + VectorStateDelta::NowRemoved + } + } + (None, Some(new)) => { + // was possibly autogenerated, remove all vectors for that document + let add_vectors = new.into_array_of_vectors(); + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } + + VectorStateDelta::WasGeneratedNowManual(add_vectors) + } + (None, None) => { + // Do we keep this document? 
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + + if document_is_kept { + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt) + // TODO: this filter works because we erase the vec database when a embedding setting changes. + // When vector pipeline will be optimized, this should be removed. + .filter(|_| !settings_diff.reindex_vectors()) + .map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or_default() + }); + let new_prompt = + prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } + } else { + VectorStateDelta::NowRemoved + } + } + }; + + // and we finally push the unique vectors into the writer + push_vectors_diff( + remove_vectors_writer, + prompts_writer, + manual_vectors_writer, + &mut key_buffer, + delta, + reindex_vectors, + )?; + } } - Ok(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector - manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () - remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt - prompts: writer_into_reader(prompts_writer)?, - }) -} + ///// -fn to_vector_map( - obkv: KvReaderDelAdd, - side: DelAdd, - document_id: &impl Fn() -> Value, -) -> Result>> { - Ok(if let Some(value) = obkv.get(side) { - let Ok(value) = from_slice(value) else { - let value = from_slice(value).map_err(InternalError::SerdeJson)?; - return Err(crate::Error::UserError(UserError::InvalidVectorsMapType { - document_id: document_id(), - value, - })); - }; - Some(value) - } else { - None - }) + let mut results = 
Vec::new(); + + for EmbedderVectorExtractor { + embedder_name, + embedder, + prompt: _, + manual_vectors_writer, + prompts_writer, + remove_vectors_writer, + } in extractors + { + results.push(ExtractedVectorPoints { + // docid, _index -> KvWriterDelAdd -> Vector + manual_vectors: writer_into_reader(manual_vectors_writer)?, + // docid -> () + remove_vectors: writer_into_reader(remove_vectors_writer)?, + // docid -> prompt + prompts: writer_into_reader(prompts_writer)?, + + embedder, + embedder_name, + }) + } + + Ok(results) } /// Computes the diff between both Del and Add numbers and @@ -270,14 +295,14 @@ fn push_vectors_diff( manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, - settings_diff: &InnerIndexSettingsDiff, + reindex_vectors: bool, ) -> Result<()> { puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); if must_remove // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. - && !settings_diff.reindex_vectors() + && !reindex_vectors { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; @@ -308,7 +333,7 @@ fn push_vectors_diff( EitherOrBoth::Left(vector) => { // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. - if !settings_diff.reindex_vectors() { + if !reindex_vectors { // We insert only the Del part of the Obkv to inform // that we only want to remove all those vectors. let mut obkv = KvWriterDelAdd::memory(); @@ -336,26 +361,6 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) } -/// Extracts the vectors from a JSON value. 
-fn extract_vectors( - value: Value, - document_id: impl Fn() -> Value, - name: &str, -) -> Result>> { - // FIXME: ugly clone of the vectors here - match serde_json::from_value(value.clone()) { - Ok(vectors) => { - Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors).unwrap_or_default()) - } - Err(_) => Err(UserError::InvalidVectorsType { - document_id: document_id(), - value, - subfield: name.to_owned(), - } - .into()), - } -} - #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] pub fn extract_embeddings( // docid, prompt diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 573e0898a..0ea0fcc5c 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -226,27 +226,31 @@ fn send_original_documents_data( let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; - let documents_chunk_cloned = original_documents_chunk.clone(); - let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - let request_threads = ThreadPoolNoAbortBuilder::new() .num_threads(crate::vector::REQUEST_PARALLELISM) .thread_name(|index| format!("embedding-request-{index}")) .build()?; - if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() { + let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) + // no point in indexing vectors without embedders + && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); + + if index_vectors { let settings_diff = settings_diff.clone(); + + let original_documents_chunk = original_documents_chunk.clone(); + let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() { - let result = extract_vector_points( - documents_chunk_cloned.clone(), - indexer, - &settings_diff, - &prompt, - &name, - ); - match result 
{ - Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => { + match extract_vector_points(original_documents_chunk.clone(), indexer, settings_diff) { + Ok(extracted_vectors) => { + for ExtractedVectorPoints { + manual_vectors, + remove_vectors, + prompts, + embedder_name, + embedder, + } in extracted_vectors + { let embeddings = match extract_embeddings( prompts, indexer, @@ -255,28 +259,26 @@ fn send_original_documents_data( ) { Ok(results) => Some(results), Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); + let _ = lmdb_writer_sx.send(Err(error)); None } }; - if !(remove_vectors.is_empty() && manual_vectors.is_empty() && embeddings.as_ref().map_or(true, |e| e.is_empty())) { - let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints { remove_vectors, embeddings, expected_dimension: embedder.dimensions(), manual_vectors, - embedder_name: name, + embedder_name, })); } } - - Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); - } + } + Err(error) => { + let _ = lmdb_writer_sx.send(Err(error)); } } }); diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index d3d05a1c1..1922bb389 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -148,6 +148,10 @@ impl EmbeddingConfigs { self.get(self.get_default_embedder_name()) } + pub fn inner_as_ref(&self) -> &HashMap, Arc)> { + &self.0 + } + /// Get the name of the default embedder configuration. 
/// /// The default embedder is determined as follows: From 02714ef5edb87f9fb371efeb2208526c2dbdb284 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:43:16 +0200 Subject: [PATCH 31/56] Add vectors from vector DB in dump --- index-scheduler/src/batch.rs | 54 ++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 582497c15..40398dc37 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -31,6 +31,7 @@ use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; +use meilisearch_types::milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::{self, Filter}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; @@ -916,8 +917,57 @@ impl IndexScheduler { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } - let (_id, doc) = ret?; - let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + let (id, doc) = ret?; + + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + 'inject_vectors: { + let embeddings = index.embeddings(&rtxn, id)?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME.to_owned()) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(milli::Error::UserError( + milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&rtxn, std::iter::once(id)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={id}") + } + }, + value: 
vectors.clone(), + }, + ) + .into()); + }; + + /// some tests to consider: + /// + /// - dump, then import, then change a document with autogenerated vectors + for (embedder_name, embeddings) in embeddings { + // don't change the entry if it already exists, because it was user-provided + vectors.entry(embedder_name).or_insert_with(|| { + + let embeddings = milli::vector::parsed_vectors::ExplicitVectors { + embeddings: milli::vector::parsed_vectors::VectorOrArrayOfVectors::from_array_of_vectors(embeddings), + user_provided: false, + }; + serde_json::to_value(embeddings).unwrap() + }); + } + } + index_dumper.push_document(&document)?; } From 2f7a8a4efb9248855a272aa7ec9e8d46a290a8f8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:46:04 +0200 Subject: [PATCH 32/56] Don't write vectors that weren't autogenerated in document DB --- .../src/update/index_documents/typed_chunk.rs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e0de2d5a1..8eb9ead28 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -193,6 +193,10 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "documents"); let _entered = span.enter(); + let fields_ids_map = index.fields_ids_map(wtxn)?; + let vectors_fid = + fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn); for typed_chunk in typed_chunks { let TypedChunk::Documents(chunk) = typed_chunk else { @@ -206,6 +210,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; + + let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); @@ -219,6 +225,24 @@ pub(crate) fn write_typed_chunk_into_index( let del_add_reader = KvReaderDelAdd::new(value); if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + let addition = match vectors_fid { + // for the "_vectors" field, only keep vectors that are marked as userProvided + Some(vectors_fid) if vectors_fid == field_id => 'vectors: { + vectors_buffer.clear(); + let Ok(mut vectors) = + crate::vector::parsed_vectors::ParsedVectors::from_bytes( + addition, + ) + else { + break 'vectors addition; + }; + vectors.retain_user_provided_vectors(); + serde_json::to_writer(&mut vectors_buffer, &vectors.0) + .map_err(InternalError::SerdeJson)?; + &vectors_buffer + } + _ => addition, + }; writer.insert(field_id, addition)?; } } From 0462ebbe582ee493d6d18d2ffdd6ac6a6761dcda Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:51:27 +0200 Subject: [PATCH 33/56] Don't write an empty _vectors field --- milli/src/update/index_documents/typed_chunk.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 8eb9ead28..6f11dd585 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -234,16 +234,23 @@ pub(crate) fn write_typed_chunk_into_index( addition, ) else { - break 'vectors addition; + break 'vectors Some(addition); }; vectors.retain_user_provided_vectors(); - serde_json::to_writer(&mut vectors_buffer, &vectors.0) + let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; + if vectors.is_empty() { + break 'vectors None; + } + + serde_json::to_writer(&mut vectors_buffer, &vectors) .map_err(InternalError::SerdeJson)?; - &vectors_buffer + Some(vectors_buffer.as_slice()) } - _ => addition, + _ => Some(addition), }; - 
writer.insert(field_id, addition)?; + if let Some(addition) = addition { + writer.insert(field_id, addition)?; + } } } From d05d49ffd8d4c666de5a5528145d313d0fcb2430 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 14:14:02 +0200 Subject: [PATCH 34/56] Fix tests --- meilisearch/tests/search/mod.rs | 54 +++++++++---------- meilisearch/tests/search/multi.rs | 30 +++++------ .../src/update/index_documents/typed_chunk.rs | 2 + 3 files changed, 44 insertions(+), 42 deletions(-) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index f601e2b03..771eee21b 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -895,9 +895,9 @@ async fn test_score_details() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] }, "_rankingScoreDetails": { @@ -1096,9 +1096,9 @@ async fn experimental_feature_vector_store() { "id": "287947", "_vectors": { "manual": [ - 1, - 2, - 3 + 1.0, + 2.0, + 3.0 ] }, "_rankingScore": 1.0 @@ -1108,9 +1108,9 @@ async fn experimental_feature_vector_store() { "id": "299537", "_vectors": { "manual": [ - 1, - 2, - 54 + 1.0, + 2.0, + 54.0 ] }, "_rankingScore": 0.9129111766815186 @@ -1120,9 +1120,9 @@ async fn experimental_feature_vector_store() { "id": "450465", "_vectors": { "manual": [ - -100, - 340, - 90 + -100.0, + 340.0, + 90.0 ] }, "_rankingScore": 0.8106412887573242 @@ -1132,9 +1132,9 @@ async fn experimental_feature_vector_store() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] }, "_rankingScore": 0.7412010431289673 @@ -1144,9 +1144,9 @@ async fn experimental_feature_vector_store() { "id": "522681", "_vectors": { "manual": [ - 10, - -23, - 32 + 10.0, + -23.0, + 32.0 ] }, "_rankingScore": 0.6972063183784485 @@ -1405,9 +1405,9 @@ async fn simple_search_with_strange_synonyms() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] } } @@ -1426,9 +1426,9 @@ async 
fn simple_search_with_strange_synonyms() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] } } @@ -1447,9 +1447,9 @@ async fn simple_search_with_strange_synonyms() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] } } diff --git a/meilisearch/tests/search/multi.rs b/meilisearch/tests/search/multi.rs index aeec1bad4..b5cf8f476 100644 --- a/meilisearch/tests/search/multi.rs +++ b/meilisearch/tests/search/multi.rs @@ -75,9 +75,9 @@ async fn simple_search_single_index() { "id": "450465", "_vectors": { "manual": [ - -100, - 340, - 90 + -100.0, + 340.0, + 90.0 ] } } @@ -96,9 +96,9 @@ async fn simple_search_single_index() { "id": "299537", "_vectors": { "manual": [ - 1, - 2, - 54 + 1.0, + 2.0, + 54.0 ] } } @@ -194,9 +194,9 @@ async fn simple_search_two_indexes() { "id": "450465", "_vectors": { "manual": [ - -100, - 340, - 90 + -100.0, + 340.0, + 90.0 ] } } @@ -227,9 +227,9 @@ async fn simple_search_two_indexes() { "cattos": "pésti", "_vectors": { "manual": [ - 1, - 2, - 3 + 1.0, + 2.0, + 3.0 ] } }, @@ -249,9 +249,9 @@ async fn simple_search_two_indexes() { ], "_vectors": { "manual": [ - 1, - 2, - 54 + 1.0, + 2.0, + 54.0 ] } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 6f11dd585..6615a4bc3 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -234,11 +234,13 @@ pub(crate) fn write_typed_chunk_into_index( addition, ) else { + // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; vectors.retain_user_provided_vectors(); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { + // skip writing empty `_vectors` map break 'vectors None; } From 30cf972987327bfb349270bb371875d9180343be Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 May 2024 18:11:16 
+0200 Subject: [PATCH 35/56] Add test with a dump --- dump/src/reader/mod.rs | 134 +++ ...__test__import_dump_v6_with_vectors-5.snap | 783 +++++++++++++++++ ...__test__import_dump_v6_with_vectors-6.snap | 786 ++++++++++++++++++ ...__test__import_dump_v6_with_vectors-7.snap | 785 +++++++++++++++++ ...__test__import_dump_v6_with_vectors-8.snap | 780 +++++++++++++++++ dump/tests/assets/v6-with-vectors.dump | Bin 0 -> 17539 bytes 6 files changed, 3268 insertions(+) create mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap create mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap create mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap create mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap create mode 100644 dump/tests/assets/v6-with-vectors.dump diff --git a/dump/src/reader/mod.rs b/dump/src/reader/mod.rs index 5bbf4ec4d..2b3732164 100644 --- a/dump/src/reader/mod.rs +++ b/dump/src/reader/mod.rs @@ -197,6 +197,140 @@ pub(crate) mod test { use super::*; use crate::reader::v6::RuntimeTogglableFeatures; + #[test] + fn import_dump_v6_with_vectors() { + // dump containing two indexes + // + // "vector", configured with an embedder + // contains: + // - one document with an overridden vector, + // - one document with a natural vector + // - one document with a _vectors map containing one additional embedder name and a natural vector + // - one document with a _vectors map containing one additional embedder name and an overridden vector + // + // "novector", no embedder + // contains: + // - a document without vector + // - a document with a random _vectors field + let dump = File::open("tests/assets/v6-with-vectors.dump").unwrap(); + let mut dump = DumpReader::open(dump).unwrap(); + + // top level infos + insta::assert_display_snapshot!(dump.date().unwrap(), @"2024-05-16 15:51:34.151044 +00:00:00"); + 
insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + + // tasks + let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap(); + let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); + meili_snap::snapshot_hash!(meili_snap::json_string!(tasks), @"278f63325ef06ca04d01df98d8207b94"); + assert_eq!(update_files.len(), 10); + assert!(update_files[0].is_none()); // the dump creation + assert!(update_files[1].is_none()); + assert!(update_files[2].is_none()); + assert!(update_files[3].is_none()); + assert!(update_files[4].is_none()); + assert!(update_files[5].is_none()); + assert!(update_files[6].is_none()); + assert!(update_files[7].is_none()); + assert!(update_files[8].is_none()); + assert!(update_files[9].is_none()); + + // indexes + let mut indexes = dump.indexes().unwrap().collect::<Result<Vec<_>>>().unwrap(); + // the indexes are not ordered in any way by default + indexes.sort_by_key(|index| index.metadata().uid.to_string()); + + let mut vector_index = indexes.pop().unwrap(); + let mut novector_index = indexes.pop().unwrap(); + assert!(indexes.is_empty()); + + // vector + + insta::assert_json_snapshot!(vector_index.metadata(), @r###" + { + "uid": "vector", + "primaryKey": "id", + "createdAt": "2024-05-16T15:33:17.240962Z", + "updatedAt": "2024-05-16T15:40:55.723052Z" + } + "###); + + { + let documents: Result<Vec<_>> = vector_index.documents().unwrap().collect(); + let mut documents = documents.unwrap(); + assert_eq!(documents.len(), 4); + + documents.sort_by_key(|doc| doc.get("id").unwrap().to_string()); + + { + let document = documents.pop().unwrap(); + insta::assert_json_snapshot!(document); + } + + { + let document = documents.pop().unwrap(); + insta::assert_json_snapshot!(document); + } + + { + let document = documents.pop().unwrap(); + insta::assert_json_snapshot!(document); + } + + { + let document = documents.pop().unwrap(); + insta::assert_json_snapshot!(document); + } + } + + // novector + + 
insta::assert_json_snapshot!(novector_index.metadata(), @r###" + { + "uid": "novector", + "primaryKey": "id", + "createdAt": "2024-05-16T15:33:03.568055Z", + "updatedAt": "2024-05-16T15:33:07.530217Z" + } + "###); + + insta::assert_json_snapshot!(novector_index.settings().unwrap().embedders, @"null"); + + { + let documents: Result> = novector_index.documents().unwrap().collect(); + let mut documents = documents.unwrap(); + assert_eq!(documents.len(), 2); + + documents.sort_by_key(|doc| doc.get("id").unwrap().to_string()); + + { + let document = documents.pop().unwrap(); + insta::assert_json_snapshot!(document, @r###" + { + "id": "e1", + "other": "random1", + "_vectors": "toto" + } + "###); + } + + { + let document = documents.pop().unwrap(); + insta::assert_json_snapshot!(document, @r###" + { + "id": "e0", + "other": "random0" + } + "###); + } + } + + assert_eq!( + dump.features().unwrap().unwrap(), + RuntimeTogglableFeatures { vector_store: true, ..Default::default() } + ); + } + #[test] fn import_dump_v6_experimental() { let dump = File::open("tests/assets/v6-with-experimental.dump").unwrap(); diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap new file mode 100644 index 000000000..43bdb9726 --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap @@ -0,0 +1,783 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e3", + "desc": "overriden vector + map", + "_vectors": { + "default": [ + 0.2, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 
0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, 
+ 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 
0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1 + ], + "toto": [ + 0.1 + ] + } +} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap new file mode 100644 index 000000000..0aad0ea97 --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap @@ -0,0 +1,786 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e2", + "desc": "natural vector + map", + "_vectors": { + "toto": [], + "default": { + "embeddings": [ + [ + -0.05189208313822746, + -0.9273212552070618, + 0.1443813145160675, + 0.0932632014155388, + 0.2665371894836426, + 0.36266782879829407, + 0.6402910947799683, + 0.32014018297195435, + 0.030915971845388412, + -0.9312191605567932, + -0.3718109726905823, + -0.2700554132461548, + -1.1014580726623535, + 0.9154956936836244, + -0.3406888246536255, + 1.0077725648880005, + 0.6577560901641846, + -0.3955195546150207, + -0.4148270785808563, + 0.1855088472366333, + 
0.5062315464019775, + -0.3632686734199524, + -0.2277890294790268, + 0.2560805082321167, + -0.3853609561920166, + -0.1604762226343155, + -0.13947471976280212, + -0.20147813856601715, + -0.4466346800327301, + -0.3761846721172333, + 0.1443382054567337, + 0.18205296993255615, + 0.49359792470932007, + -0.22538000345230105, + -0.4996317625045776, + -0.22734887897968292, + -0.6034309267997742, + -0.7857939600944519, + -0.34923747181892395, + -0.3466345965862274, + 0.21176661550998688, + -0.5101462006568909, + -0.3403083384037018, + 0.000315118464641273, + 0.236465722322464, + -0.10246097296476364, + -1.3013339042663574, + 0.3419138789176941, + -0.32963496446609497, + -0.0901619717478752, + -0.5426247119903564, + 0.22656650841236117, + -0.44758284091949463, + 0.14151698350906372, + -0.1089438870549202, + 0.5500766634941101, + -0.670711100101471, + -0.6227269768714905, + 0.3894464075565338, + -0.27609574794769287, + 0.7028202414512634, + -0.19697771966457367, + 0.328511506319046, + 0.5063360929489136, + 0.4065195322036743, + 0.2614171802997589, + -0.30274391174316406, + 1.0393824577331543, + -0.7742937207221985, + -0.7874112129211426, + -0.6749666929244995, + 0.5190866589546204, + 0.004123548045754433, + -0.28312963247299194, + -0.038731709122657776, + -1.0142987966537476, + -0.09519586712121964, + 0.8755272626876831, + 0.4876938760280609, + 0.7811151742935181, + 0.85174959897995, + 0.11826585978269576, + 0.5373436808586121, + 0.3649002015590668, + 0.19064077734947205, + -0.00287026260048151, + -0.7305403351783752, + -0.015206154435873032, + -0.7899249196052551, + 0.19407285749912265, + 0.08596625179052353, + -0.28976231813430786, + -0.1525907665491104, + 0.3798313438892365, + 0.050306469202041626, + -0.5697937607765198, + 0.4219021201133728, + 0.276252806186676, + 0.1559903472661972, + 0.10030482709407806, + -0.4043720066547394, + -0.1969818025827408, + 0.5739826560020447, + 0.2116064727306366, + -1.4620544910430908, + -0.7802462577819824, + -0.24739810824394223, + 
-0.09791352599859238, + -0.4413802027702331, + 0.21549351513385773, + -0.9520436525344848, + -0.08762510865926743, + 0.08154498040676117, + -0.6154940724372864, + -1.01079523563385, + 0.885427713394165, + 0.6967288851737976, + 0.27186504006385803, + -0.43194177746772766, + -0.11248451471328735, + 0.7576630711555481, + 0.4998855590820313, + 0.0264343973249197, + 0.9872855544090272, + 0.5634694695472717, + 0.053698331117630005, + 0.19410227239131927, + 0.3570743501186371, + -0.23670297861099243, + -0.9114483594894408, + 0.07884842902421951, + 0.7318344116210938, + 0.44630110263824463, + 0.08745364099740982, + -0.347101628780365, + -0.4314247667789459, + -0.5060274004936218, + 0.003706763498485088, + 0.44320008158683777, + -0.00788921769708395, + -0.1368623524904251, + -0.17391923069953918, + 0.14473655819892883, + 0.10927865654230118, + 0.6974599361419678, + 0.005052129738032818, + -0.016953065991401672, + -0.1256176233291626, + -0.036742497235536575, + 0.5591985583305359, + -0.37619709968566895, + 0.22429119050502777, + 0.5403043031692505, + -0.8603790998458862, + -0.3456307053565979, + 0.9292937517166138, + 0.5074859261512756, + 0.6310645937919617, + -0.3091641068458557, + 0.46902573108673096, + 0.7891915440559387, + 0.4499550759792328, + 0.2744995653629303, + 0.2712305784225464, + -0.04349074140191078, + -0.3638863265514374, + 0.7839881777763367, + 0.7352104783058167, + -0.19457511603832245, + -0.5957832932472229, + -0.43704694509506226, + -1.084769368171692, + 0.4904985725879669, + 0.5385226011276245, + 0.1891629993915558, + 0.12338479608297348, + 0.8315675258636475, + -0.07830192148685455, + 1.0916285514831543, + -0.28066861629486084, + -1.3585069179534912, + 0.5203898549079895, + 0.08678033947944641, + -0.2566044330596924, + 0.09484415501356123, + -0.0180208683013916, + 1.0264745950698853, + -0.023572135716676712, + 0.5864979028701782, + 0.7625196576118469, + -0.2543414533138275, + -0.8877770900726318, + 0.7611982822418213, + -0.06220436468720436, + 
0.937336564064026, + 0.2704363465309143, + -0.37733694911003113, + 0.5076137781143188, + -0.30641937255859375, + 0.6252772808074951, + -0.0823579877614975, + -0.03736555948853493, + 0.4131673276424408, + -0.6514252424240112, + 0.12918265163898468, + -0.4483584463596344, + 0.6750786304473877, + -0.37008383870124817, + -0.02324833907186985, + 0.38027650117874146, + -0.26374951004981995, + 0.4346931278705597, + 0.42882832884788513, + -0.48798441886901855, + 1.1882442235946655, + 0.5132288336753845, + 0.5284568667411804, + -0.03538886830210686, + 0.29620853066444397, + -1.0683696269989014, + 0.25936177372932434, + 0.10404160618782043, + -0.25796034932136536, + 0.027896970510482788, + -0.09225251525640488, + 1.4811025857925415, + 0.641173779964447, + -0.13838383555412292, + -0.3437179923057556, + 0.5667019486427307, + -0.5400741696357727, + 0.31090837717056274, + 0.6470608115196228, + -0.3747067153453827, + -0.7364534735679626, + -0.07431528717279434, + 0.5173454880714417, + -0.6578747034072876, + 0.7107478976249695, + -0.7918999791145325, + -0.0648345872759819, + 0.609937846660614, + -0.7329513430595398, + 0.9741371870040894, + 0.17912346124649048, + -0.02658769302070141, + 0.5162150859832764, + -0.3978803157806397, + -0.7833885550498962, + -0.6497276425361633, + -0.3898126780986786, + -0.0952848568558693, + 0.2663288116455078, + -0.1604052186012268, + 0.373076468706131, + -0.8357769250869751, + -0.05217683315277099, + -0.2680160701274872, + 0.8389158248901367, + 0.6833611130714417, + -0.6712407469749451, + 0.7406917214393616, + -0.44522786140441895, + -0.34645363688468933, + -0.27384576201438904, + -0.9878405928611756, + -0.8166060447692871, + 0.06268279999494553, + 0.38567957282066345, + -0.3274703919887543, + 0.5296315550804138, + -0.11810623109340668, + 0.23029841482639313, + 0.08616159111261368, + -0.2195747196674347, + 0.09430307894945145, + 0.4057176411151886, + 0.4892159104347229, + -0.1636916548013687, + -0.6071445345878601, + 0.41256585717201233, + 
0.622254490852356, + -0.41223976016044617, + -0.6686707139015198, + -0.7474371790885925, + -0.8509522080421448, + -0.16754287481307983, + -0.9078601002693176, + -0.29653599858283997, + -0.5020652413368225, + 0.4692700505256653, + 0.01281109917908907, + -0.16071580350399017, + 0.03388889133930206, + -0.020511148497462273, + 0.5027827024459839, + -0.20729811489582065, + 0.48107290267944336, + 0.33669769763946533, + -0.5275911688804626, + 0.48271527886390686, + 0.2738940715789795, + -0.033152539283037186, + -0.13629786670207977, + -0.05965912342071533, + -0.26200807094573975, + 0.04002794995903969, + -0.34095603227615356, + -3.986898899078369, + -0.46819332242012024, + -0.422744482755661, + -0.169097900390625, + 0.6008929014205933, + 0.058016058057546616, + -0.11401277780532836, + -0.3077819049358368, + -0.09595538675785063, + 0.6723822355270386, + 0.19367831945419312, + 0.28304359316825867, + 0.1609862744808197, + 0.7567598819732666, + 0.6889985799789429, + 0.06907720118761063, + -0.04188092052936554, + -0.7434936165809631, + 0.13321782648563385, + 0.8456063270568848, + -0.10364038497209548, + -0.45084846019744873, + -0.4758241474628449, + 0.43882066011428833, + -0.6432598829269409, + 0.7217311859130859, + -0.24189773201942444, + 0.12737572193145752, + -1.1008601188659668, + -0.3305315673351288, + 0.14614742994308472, + -0.7819333076477051, + 0.5287120342254639, + -0.055538054555654526, + 0.1877404749393463, + -0.6907662153244019, + 0.5616975426673889, + -0.4611121714115143, + -0.26109233498573303, + -0.12898315489292145, + -0.3724522292613983, + -0.7191406488418579, + -0.4425233602523804, + -0.644108235836029, + 0.8424481153488159, + 0.17532426118850708, + -0.5121750235557556, + -0.6467239260673523, + -0.0008507720194756985, + 0.7866212129592896, + -0.02644744887948036, + -0.005045140627771616, + 0.015782782807946205, + 0.16334445774555206, + -0.1913367658853531, + -0.13697923719882965, + -0.6684983372688293, + 0.18346354365348816, + -0.341105580329895, + 
0.5427411198616028, + 0.3779832422733307, + -0.6778115034103394, + -0.2931850254535675, + -0.8805161714553833, + -0.4212774932384491, + -0.5368952751159668, + -1.3937891721725464, + -1.225494146347046, + 0.4276703894138336, + 1.1205668449401855, + -0.6005299687385559, + 0.15732505917549133, + -0.3914784789085388, + -1.357046604156494, + -0.4707142114639282, + -0.1497287154197693, + -0.25035548210144043, + -0.34328439831733704, + 0.39083412289619446, + 0.1623048633337021, + -0.9275814294815063, + -0.6430015563964844, + 0.2973862886428833, + 0.5580436587333679, + -0.6232585310935974, + -0.6611042022705078, + 0.4015969038009643, + -1.0232892036437988, + -0.2585645020008087, + -0.5431421399116516, + 0.5021264553070068, + -0.48601630330085754, + -0.010242084041237833, + 0.5862035155296326, + 0.7316920161247253, + 0.4036808013916016, + 0.4269520044326782, + -0.705938458442688, + 0.7747307419776917, + 0.10164368897676468, + 0.7887958884239197, + -0.9612497091293336, + 0.12755516171455383, + 0.06812842190265656, + -0.022603651508688927, + 0.14722754061222076, + -0.5588505268096924, + -0.20689940452575684, + 0.3557641804218292, + -0.6812759637832642, + 0.2860803008079529, + -0.38954633474349976, + 0.1759403496980667, + -0.5678874850273132, + -0.1692986786365509, + -0.14578519761562347, + 0.5711379051208496, + 1.0208125114440918, + 0.7759483456611633, + -0.372348427772522, + -0.5460885763168335, + 0.7190321683883667, + -0.6914990544319153, + 0.13365162909030914, + -0.4854792356491089, + 0.4054908752441406, + 0.4502798914909363, + -0.3041122555732727, + -0.06726965308189392, + -0.05570871382951737, + -0.0455719493329525, + 0.4785125255584717, + 0.8867972493171692, + 0.4107886850833893, + 0.6121342182159424, + -0.20477132499217987, + -0.5598517656326294, + -0.6443566679954529, + -0.5905212759971619, + -0.5571200251579285, + 0.17573799192905426, + -0.28621870279312134, + 0.1685224026441574, + 0.09719007462263109, + -0.04223639518022537, + -0.28623101115226746, + 
-0.1449810117483139, + -0.3789580464363098, + -0.5227636098861694, + -0.049728814512491226, + 0.7849089503288269, + 0.16792525351047516, + 0.9849340915679932, + -0.6559549570083618, + 0.35723909735679626, + -0.6822739243507385, + 1.2873116731643677, + 0.19993330538272855, + 0.03512010723352432, + -0.6972134113311768, + 0.18453484773635864, + -0.2437680810689926, + 0.2156416028738022, + 0.5230382680892944, + 0.22020135819911957, + 0.8314080238342285, + 0.15627102553844452, + -0.7330264449119568, + 0.3888184726238251, + -0.22034703195095065, + 0.5457669496536255, + -0.48084837198257446, + -0.45576658844947815, + -0.09287727624177931, + -0.06968110054731369, + 0.35125672817230225, + -0.4278119504451752, + 0.2038476765155792, + 0.11392722278833388, + 0.9433983564376832, + -0.4097744226455689, + 0.035297419875860214, + -0.4274404048919678, + -0.25100165605545044, + 1.0943366289138794, + -0.07634022831916809, + -0.2925529479980469, + -0.7512530088424683, + 0.2649727463722229, + -0.4078235328197479, + -0.3372223973274231, + 0.05190162733197212, + 0.005654910113662481, + -0.0001571219472680241, + -0.35445958375930786, + -0.7837416529655457, + 0.1500556766986847, + 0.4383024573326111, + 0.6099548935890198, + 0.05951934307813645, + -0.21325334906578064, + 0.0199207104742527, + -0.22704418003559113, + -0.6481077671051025, + 0.37442275881767273, + -1.015955924987793, + 0.38637226819992065, + -0.06489371508359909, + -0.494120329618454, + 0.3469836115837097, + 0.15402406454086304, + -0.7660972476005554, + -0.7053225040435791, + -0.25964751839637756, + 0.014004424214363098, + -0.2860170006752014, + -0.17565494775772095, + -0.45117494463920593, + -0.0031954257283359766, + 0.09676837921142578, + -0.514464259147644, + 0.41698193550109863, + -0.21642713248729703, + -0.5398141145706177, + -0.3647628426551819, + 0.37005379796028137, + 0.239425927400589, + -0.08833975344896317, + 0.934946596622467, + -0.48340797424316406, + 0.6241437792778015, + -0.7253676652908325, + 
-0.04303571209311485, + 1.1125205755233765, + -0.15692919492721558, + -0.2914651036262512, + -0.5117168426513672, + 0.21365483105182648, + 0.4924402534961701, + 0.5269662141799927, + 0.0352792888879776, + -0.149167999625206, + -0.6019760370254517, + 0.08245442807674408, + 0.4900692105293274, + 0.518824577331543, + -0.00005570516441366635, + -0.553304135799408, + 0.22217543423175812, + 0.5047767758369446, + 0.135724738240242, + 1.1511540412902832, + -0.3541218340396881, + -0.9712511897087096, + 0.8353699445724487, + -0.39227569103240967, + -0.9117669463157654, + -0.26349931955337524, + 0.05597023293375969, + 0.20695461332798004, + 0.3178807199001312, + 1.0663238763809204, + 0.5062212347984314, + 0.7288597822189331, + 0.09899299591779707, + 0.553720235824585, + 0.675009548664093, + -0.20067055523395536, + 0.3138423264026642, + -0.6886593103408813, + -0.2910398542881012, + -1.3186300992965698, + -0.4684459865093231, + -0.095743365585804, + -0.1257995069026947, + -0.4858281314373016, + -0.4935407340526581, + -0.3266896903514862, + -0.3928797245025635, + -0.40803104639053345, + -0.9975396394729614, + 0.4229583740234375, + 0.37309643626213074, + 0.4431034922599793, + 0.30364808440208435, + -0.3765178918838501, + 0.5616499185562134, + 0.16904796659946442, + -0.7343707084655762, + 0.2560209631919861, + 0.6166825294494629, + 0.3200829327106476, + -0.4483652710914612, + 0.16224201023578644, + -0.31495288014411926, + -0.42713335156440735, + 0.7270734906196594, + 0.7049484848976135, + -0.0571461021900177, + 0.04477125033736229, + -0.6647796034812927, + 1.183672308921814, + 0.36199676990509033, + 0.046881116926670074, + 0.4515796303749085, + 0.9278061985969543, + 0.31471705436706543, + -0.7073333859443665, + -0.3443860113620758, + 0.5440067052841187, + -0.15020819008350372, + -0.541202962398529, + 0.5203295946121216, + 1.2192286252975464, + -0.9983593225479126, + -0.18758884072303772, + 0.2758221924304962, + -0.6511523723602295, + -0.1584404855966568, + -0.236241415143013, + 
0.2692437767982483, + -0.4941152036190033, + 0.4987454116344452, + -0.3331359028816223, + 0.3163745701313019, + 0.745529294013977, + -0.2905873656272888, + 0.13602906465530396, + 0.4679684340953827, + 1.0555986166000366, + 1.075700044631958, + 0.5368486046791077, + -0.5118206739425659, + 0.8668332099914551, + -0.5726966857910156, + -0.7811751961708069, + 0.1938626915216446, + -0.1929349899291992, + 0.1757766306400299, + 0.6384295225143433, + 0.26462844014167786, + 0.9542630314826964, + 0.19313029944896695, + 1.264248013496399, + -0.6304428577423096, + 0.0487106591463089, + -0.16211535036563873, + -0.7894763350486755, + 0.3582514822483063, + -0.04153040423989296, + 0.635784387588501, + 0.6554391980171204, + -0.47010496258735657, + -0.8302040696144104, + -0.1350124627351761, + 0.2568812072277069, + 0.13614831864833832, + -0.2563649117946625, + -1.0434694290161133, + 0.3232482671737671, + 0.47882452607154846, + 0.4298652410507202, + 1.0563770532608032, + -0.28917592763900757, + -0.8533256649971008, + 0.10648339986801147, + 0.6376127004623413, + -0.20832888782024384, + 0.2370245456695557, + 0.0018312990432605147, + -0.2034837007522583, + 0.01051164511591196, + -1.105310082435608, + 0.29724350571632385, + 0.15604574978351593, + 0.1973688006401062, + 0.44394731521606445, + 0.3974513411521912, + -0.13625948131084442, + 0.9571986198425292, + 0.2257384955883026, + 0.2323588728904724, + -0.5583669543266296, + -0.7854922413825989, + 0.1647188365459442, + -1.6098142862319946, + 0.318587988615036, + -0.13399995863437653, + -0.2172701060771942, + -0.767514705657959, + -0.5813586711883545, + -0.3195130527019501, + -0.04894036799669266, + 0.2929930090904236, + -0.8213384747505188, + 0.07181350141763687, + 0.7469993829727173, + 0.6407455801963806, + 0.16365697979927063, + 0.7870153188705444, + 0.6524736881256104, + 0.6399973630905151, + -0.04992736503481865, + -0.03959266096353531, + -0.2512352466583252, + 0.8448855876922607, + -0.1422702670097351, + 0.1216789186000824, + 
-1.2647287845611572, + 0.5931149125099182, + 0.7186052203178406, + -0.06118432432413101, + -1.1942816972732544, + -0.17677085101604462, + 0.31543800234794617, + -0.32252824306488037, + 0.8255583047866821, + -0.14529970288276672, + -0.2695446312427521, + -0.33378756046295166, + -0.1653425395488739, + 0.1454019844532013, + -0.3920115828514099, + 0.912214994430542, + -0.7279734015464783, + 0.7374742031097412, + 0.933980405330658, + 0.13429680466651917, + -0.514870285987854, + 0.3989711999893189, + -0.11613689363002776, + 0.4022413492202759, + -0.9990655779838562, + -0.33749932050704956, + -0.4334589838981629, + -1.376373291015625, + -0.2993924915790558, + -0.09454808384180068, + -0.01314175222069025, + -0.001090060803107917, + 0.2137461006641388, + 0.2938512861728668, + 0.17508235573768616, + 0.8260607123374939, + -0.7218498587608337, + 0.2414487451314926, + -0.47296759486198425, + -0.3002610504627228, + -1.238540768623352, + 0.08663805574178696, + 0.6805586218833923, + 0.5909030437469482, + -0.42807504534721375, + -0.22887496650218964, + 0.47537800669670105, + -1.0474627017974854, + 0.6338009238243103, + 0.06548397243022919, + 0.4971011281013489, + 1.3484878540039063 + ] + ], + "userProvided": false + } + } +} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap new file mode 100644 index 000000000..f2a5e1d69 --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap @@ -0,0 +1,785 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e1", + "desc": "natural vector", + "_vectors": { + "default": { + "embeddings": [ + [ + -0.2979458272457123, + -0.5288640856742859, + -0.019957859069108963, + -0.18495318293571472, + 0.7429973483085632, + 0.5238497257232666, + 0.432366281747818, + 0.32744166254997253, + 0.0020762972999364138, + -0.9507834911346436, + -0.35097137093544006, + 
0.08469701558351517, + -1.4176613092422483, + 0.4647577106952667, + -0.69340580701828, + 1.0372896194458008, + 0.3716741800308227, + 0.06031008064746857, + -0.6152024269104004, + 0.007914665155112743, + 0.7954924702644348, + -0.20773003995418549, + 0.09376765787601472, + 0.04508133605122566, + -0.2084471583366394, + -0.1518009901046753, + 0.018195509910583496, + -0.07044368237257004, + -0.18119366466999057, + -0.4480230510234833, + 0.3822529911994934, + 0.1911812424659729, + 0.4674372375011444, + 0.06963984668254852, + -0.09341949224472046, + 0.005675444379448891, + -0.6774799227714539, + -0.7066726684570313, + -0.39256376028060913, + 0.04005039855837822, + 0.2084812968969345, + -0.7872875928878784, + -0.8205880522727966, + 0.2919981777667999, + -0.06004738807678223, + -0.4907574355602264, + -1.5937862396240234, + 0.24249385297298431, + -0.14709846675395966, + -0.11860740929841997, + -0.8299489617347717, + 0.472964346408844, + -0.497518390417099, + -0.22205302119255063, + -0.4196169078350067, + 0.32697558403015137, + -0.360930860042572, + -0.9789686799049376, + 0.1887447088956833, + -0.403737336397171, + 0.18524253368377688, + 0.3768732249736786, + 0.3666233420372009, + 0.3511938452720642, + 0.6985810995101929, + 0.41721710562705994, + 0.09754953533411026, + 0.6204307079315186, + -1.0762996673583984, + -0.06263761967420578, + -0.7376511693000793, + 0.6849768161773682, + -0.1745152473449707, + -0.40449759364128113, + 0.20757411420345304, + -0.8424443006515503, + 0.330015629529953, + 0.3489064872264862, + 1.0954371690750122, + 0.8487558960914612, + 1.1076823472976685, + 0.61430823802948, + 0.4155903458595276, + 0.4111340939998626, + 0.05753209814429283, + -0.06429877132177353, + -0.765606164932251, + -0.41703930497169495, + -0.508820652961731, + 0.19859947264194489, + -0.16607828438282013, + -0.28112146258354187, + 0.11032675206661224, + 0.38809511065483093, + -0.36498191952705383, + -0.48671194911003113, + 0.6755134463310242, + 0.03958442434668541, + 
0.4478721618652344, + -0.10335399955511092, + -0.9546685814857484, + -0.6087718605995178, + 0.17498846352100372, + 0.08320838958024979, + -1.4478336572647097, + -0.605027437210083, + -0.5867993235588074, + -0.14711688458919525, + -0.5447602272033691, + -0.026259321719408035, + -0.6997418403625488, + -0.07349082082509995, + 0.10638900846242905, + -0.7133527398109436, + -0.9396815299987792, + 1.087092399597168, + 1.1885089874267578, + 0.4011896848678589, + -0.4089202582836151, + -0.10938972979784012, + 0.6726722121238708, + 0.24576938152313232, + -0.24247920513153076, + 1.1499971151351929, + 0.47813335061073303, + -0.05331678315997124, + 0.32338133454322815, + 0.4870913326740265, + -0.23144258558750153, + -1.2023426294326782, + 0.2349330335855484, + 1.080536961555481, + 0.29334118962287903, + 0.391574501991272, + -0.15818795561790466, + -0.2948290705680847, + -0.024689948186278343, + 0.06602869182825089, + 0.5937030911445618, + -0.047901444137096405, + -0.512734591960907, + -0.35780075192451477, + 0.28751692175865173, + 0.4298716187477112, + 0.9242428541183472, + -0.17208744585514069, + 0.11515070497989656, + -0.0335976779460907, + -0.3422986567020416, + 0.5344581604003906, + 0.19895796477794647, + 0.33001241087913513, + 0.6390730142593384, + -0.6074934005737305, + -0.2553696632385254, + 0.9644920229911804, + 0.2699219584465027, + 0.6403993368148804, + -0.6380003690719604, + -0.027310986071825027, + 0.638815701007843, + 0.27719101309776306, + -0.13553589582443237, + 0.750195324420929, + 0.1224869191646576, + -0.20613941550254825, + 0.8444448709487915, + 0.16200250387191772, + -0.24750925600528717, + -0.739950954914093, + -0.28443849086761475, + -1.176282525062561, + 0.516107976436615, + 0.3774825632572174, + 0.10906043648719788, + 0.07962015271186829, + 0.7384604215621948, + -0.051241904497146606, + 1.1730090379714966, + -0.4828610122203827, + -1.404372215270996, + 0.8811132311820984, + -0.3839482367038727, + 0.022516896948218346, + -0.0491158664226532, + 
-0.43027013540267944, + 1.2049334049224854, + -0.27309560775756836, + 0.6883630752563477, + 0.8264574408531189, + -0.5020735263824463, + -0.4874092042446137, + 0.6007202863693237, + -0.4965405762195587, + 1.1302915811538696, + 0.032572727650403976, + -0.3731859028339386, + 0.658271849155426, + -0.9023059010505676, + 0.7400162220001221, + 0.014550759457051754, + -0.19699542224407196, + 0.2319706380367279, + -0.789058268070221, + -0.14905710518360138, + -0.5826214551925659, + 0.207652747631073, + -0.4507439732551574, + -0.3163885474205017, + 0.3604124188423157, + -0.45119962096214294, + 0.3428427278995514, + 0.3005594313144684, + -0.36026081442832947, + 1.1014249324798584, + 0.40884315967559814, + 0.34991952776908875, + -0.1806638240814209, + 0.27440476417541504, + -0.7118373513221741, + 0.4645499587059021, + 0.214790478348732, + -0.2343102991580963, + 0.10500429570674896, + -0.28034430742263794, + 1.2267805337905884, + 1.0561333894729614, + -0.497364342212677, + -0.6143305897712708, + 0.24963727593421936, + -0.33136463165283203, + -0.01473914459347725, + 0.495918869972229, + -0.6985538005828857, + -1.0033197402954102, + 0.35937801003456116, + 0.6325868368148804, + -0.6808838844299316, + 1.0354058742523191, + -0.7214401960372925, + -0.33318862318992615, + 0.874398410320282, + -0.6594992280006409, + 0.6830640435218811, + -0.18534131348133087, + 0.024834271520376205, + 0.19901277124881744, + -0.5992477536201477, + -1.2126628160476685, + -0.9245557188987732, + -0.3898217976093292, + -0.1286519467830658, + 0.4217943847179413, + -0.1143646091222763, + 0.5630772709846497, + -0.5240639448165894, + 0.21152715384960177, + -0.3792001008987427, + 0.8266305327415466, + 1.170984387397766, + -0.8072142004966736, + 0.11382893472909927, + -0.17953898012638092, + -0.1789460331201553, + -0.15078622102737427, + -1.2082908153533936, + -0.7812382578849792, + -0.10903695970773696, + 0.7303897142410278, + -0.39054441452026367, + 0.19511254131793976, + -0.09121843427419662, + 
0.22400228679180145, + 0.30143046379089355, + 0.1141919493675232, + 0.48112115263938904, + 0.7307931780815125, + 0.09701362252235413, + -0.2795647978782654, + -0.3997688889503479, + 0.5540812611579895, + 0.564578115940094, + -0.40065160393714905, + -0.3629159033298493, + -0.3789091110229492, + -0.7298538088798523, + -0.6996853351593018, + -0.4477842152118683, + -0.289089560508728, + -0.6430277824401855, + 0.2344944179058075, + 0.3742927014827728, + -0.5079357028007507, + 0.28841453790664673, + 0.06515737622976303, + 0.707315981388092, + 0.09498685598373412, + 0.8365515470504761, + 0.10002726316452026, + -0.7695478200912476, + 0.6264724135398865, + 0.7562043070793152, + -0.23112858831882477, + -0.2871039807796478, + -0.25010058283805847, + 0.2783474028110504, + -0.03224996477365494, + -0.9119359850883484, + -3.6940200328826904, + -0.5099936127662659, + -0.1604711413383484, + 0.17453284561634064, + 0.41759559512138367, + 0.1419190913438797, + -0.11362407356500626, + -0.33312007784843445, + 0.11511333286762238, + 0.4667884409427643, + -0.0031647447030991316, + 0.15879854559898376, + 0.3042248487472534, + 0.5404849052429199, + 0.8515422344207764, + 0.06286454200744629, + 0.43790125846862793, + -0.8682025074958801, + -0.06363756954669952, + 0.5547921657562256, + -0.01483887154608965, + -0.07361344993114471, + -0.929947018623352, + 0.3502565622329712, + -0.5080993175506592, + 1.0380364656448364, + -0.2017953395843506, + 0.21319580078125, + -1.0763001441955566, + -0.556368887424469, + 0.1949922740459442, + -0.6445739269256592, + 0.6791343688964844, + 0.21188358962535855, + 0.3736183941364288, + -0.21800459921360016, + 0.7597446441650391, + -0.3732394874095917, + -0.4710160195827484, + 0.025146087631583217, + 0.05341297015547752, + -0.9522109627723694, + -0.6000866889953613, + -0.08469046652317047, + 0.5966026186943054, + 0.3444081246852875, + -0.461188405752182, + -0.5279349088668823, + 0.10296865552663804, + 0.5175143480300903, + -0.20671147108078003, + 
0.13392412662506104, + 0.4812754988670349, + 0.2993808686733246, + -0.3005635440349579, + 0.5141698122024536, + -0.6239235401153564, + 0.2877119481563568, + -0.4452739953994751, + 0.5621107816696167, + 0.5047508478164673, + -0.4226335883140564, + -0.18578553199768064, + -1.1967322826385498, + 0.28178197145462036, + -0.8692031502723694, + -1.1812998056411743, + -1.4526212215423584, + 0.4645712077617645, + 0.9327932000160216, + -0.6560136675834656, + 0.461549699306488, + -0.5621527433395386, + -1.328449010848999, + -0.08676894754171371, + 0.00021918353741057217, + -0.18864136934280396, + 0.1259666532278061, + 0.18240638077259064, + -0.14919660985469818, + -0.8965857625007629, + -0.7539900541305542, + 0.013973715715110302, + 0.504276692867279, + -0.704748272895813, + -0.6428424119949341, + 0.6303996443748474, + -0.5404738187789917, + -0.31176653504371643, + -0.21262824535369873, + 0.18736739456653595, + -0.7998970746994019, + 0.039946746081113815, + 0.7390344738960266, + 0.4283199906349182, + 0.3795057237148285, + 0.07204607129096985, + -0.9230587482452391, + 0.9440426230430604, + 0.26272690296173096, + 0.5598306655883789, + -1.0520871877670288, + -0.2677186131477356, + -0.1888762265443802, + 0.30426350235939026, + 0.4746131896972656, + -0.5746733546257019, + -0.4197768568992615, + 0.8565112948417664, + -0.6767723560333252, + 0.23448683321475983, + -0.2010004222393036, + 0.4112907350063324, + -0.6497949957847595, + -0.418667733669281, + -0.4950824975967407, + 0.44438859820365906, + 1.026281714439392, + 0.482397586107254, + -0.26220494508743286, + -0.3640787005424499, + 0.5907743573188782, + -0.8771642446517944, + 0.09708411991596222, + -0.3671700060367584, + 0.4331349730491638, + 0.619417667388916, + -0.2684665620326996, + -0.5123821496963501, + -0.1502324342727661, + -0.012190685607492924, + 0.3580845892429352, + 0.8617186546325684, + 0.3493645489215851, + 1.0270192623138428, + 0.18297909200191495, + -0.5881339311599731, + -0.1733516901731491, + -0.5040576457977295, 
+ -0.340370237827301, + -0.26767754554748535, + -0.28570041060447693, + -0.032928116619586945, + 0.6029254794120789, + 0.17397655546665192, + 0.09346921741962431, + 0.27815181016921997, + -0.46699589490890503, + -0.8148876428604126, + -0.3964351713657379, + 0.3812595009803772, + 0.13547226786613464, + 0.7126688361167908, + -0.3473474085330963, + -0.06573959439992905, + -0.6483767032623291, + 1.4808889627456665, + 0.30924928188323975, + -0.5085946917533875, + -0.8613000512123108, + 0.3048902451992035, + -0.4241599142551422, + 0.15909206867218018, + 0.5764641761779785, + -0.07879110425710678, + 1.015336513519287, + 0.07599356025457382, + -0.7025855779647827, + 0.30047643184661865, + -0.35094937682151794, + 0.2522146999835968, + -0.2338722199201584, + -0.8326804637908936, + -0.13695412874221802, + -0.03452421352267265, + 0.47974953055381775, + -0.18385636806488037, + 0.32438594102859497, + 0.1797013282775879, + 0.787494957447052, + -0.12579888105392456, + -0.07507286965847015, + -0.4389670491218567, + 0.2720070779323578, + 0.8138866424560547, + 0.01974171027541161, + -0.3057698905467987, + -0.6709924936294556, + 0.0885881632566452, + -0.2862754464149475, + 0.03475658595561981, + -0.1285519152879715, + 0.3838353455066681, + -0.2944154739379883, + -0.4204859137535095, + -0.4416137933731079, + 0.13426260650157928, + 0.36733248829841614, + 0.573428750038147, + -0.14928072690963745, + -0.026076916605234143, + 0.33286052942276, + -0.5340145826339722, + -0.17279052734375, + -0.01154550164937973, + -0.6620771884918213, + 0.18390542268753052, + -0.08265615254640579, + -0.2489682286977768, + 0.2429984211921692, + -0.044153645634651184, + -0.986578404903412, + -0.33574509620666504, + -0.5387663841247559, + 0.19767941534519196, + 0.12540718913078308, + -0.3403128981590271, + -0.4154576361179352, + 0.17275673151016235, + 0.09407442808151244, + -0.5414086580276489, + 0.4393929839134216, + 0.1725579798221588, + -0.4998118281364441, + -0.6926208138465881, + 0.16552448272705078, + 
0.6659538149833679, + -0.10949844866991044, + 0.986426830291748, + 0.01748848147690296, + 0.4003709554672241, + -0.5430638194084167, + 0.35347291827201843, + 0.6887399554252625, + 0.08274628221988678, + 0.13407137989997864, + -0.591465950012207, + 0.3446292281150818, + 0.6069018244743347, + 0.1935492902994156, + -0.0989871397614479, + 0.07008486241102219, + -0.8503749370574951, + -0.09507356584072112, + 0.6259510517120361, + 0.13934025168418884, + 0.06392545253038406, + -0.4112265408039093, + -0.08475656062364578, + 0.4974113404750824, + -0.30606114864349365, + 1.111435890197754, + -0.018766529858112335, + -0.8422622680664063, + 0.4325508773326874, + -0.2832120656967163, + -0.4859798848628998, + -0.41498348116874695, + 0.015977520495653152, + 0.5292825698852539, + 0.4538311660289765, + 1.1328668594360352, + 0.22632671892642975, + 0.7918671369552612, + 0.33401933312416077, + 0.7306135296821594, + 0.3548600673675537, + 0.12506209313869476, + 0.8573207855224609, + -0.5818327069282532, + -0.6953738927841187, + -1.6171947717666626, + -0.1699674427509308, + 0.6318262815475464, + -0.05671752244234085, + -0.28145185112953186, + -0.3976689279079437, + -0.2041076272726059, + -0.5495951175689697, + -0.5152917504310608, + -0.9309796094894408, + 0.101932130753994, + 0.1367802917957306, + 0.1490798443555832, + 0.5304336547851563, + -0.5082434415817261, + 0.06688683480024338, + 0.14657628536224365, + -0.782435953617096, + 0.2962816655635834, + 0.6965363621711731, + 0.8496337532997131, + -0.3042965829372406, + 0.04343798756599426, + 0.0330701619386673, + -0.5662598013877869, + 1.1086925268173218, + 0.756072998046875, + -0.204134538769722, + 0.2404300570487976, + -0.47848284244537354, + 1.3659011125564575, + 0.5645433068275452, + -0.15836156904697418, + 0.43395575881004333, + 0.5944653749465942, + 1.0043466091156006, + -0.49446743726730347, + -0.5954391360282898, + 0.5341240763664246, + 0.020598189905285835, + -0.4036853015422821, + 0.4473709762096405, + 1.1998231410980225, + 
-0.9317775368690492, + -0.23321466147899628, + 0.2052552700042725, + -0.7423108816146851, + -0.19917210936546328, + -0.1722569614648819, + -0.034072667360305786, + -0.00671181408688426, + 0.46396249532699585, + -0.1372445821762085, + 0.053376372903585434, + 0.7392690777778625, + -0.38447609543800354, + 0.07497968524694443, + 0.5197252631187439, + 1.3746477365493774, + 0.9060075879096984, + 0.20000585913658145, + -0.4053704142570496, + 0.7497360110282898, + -0.34087055921554565, + -1.101803183555603, + 0.273650586605072, + -0.5125769376754761, + 0.22472351789474487, + 0.480757474899292, + -0.19845178723335263, + 0.8857700824737549, + 0.30752456188201904, + 1.1109285354614258, + -0.6768012642860413, + 0.524367094039917, + -0.22495046257972717, + -0.4224412739276886, + 0.40753406286239624, + -0.23133376240730288, + 0.3297771215438843, + 0.4905449151992798, + -0.6813114285469055, + -0.7543983459472656, + -0.5599071383476257, + 0.14351597428321838, + -0.029278717935085297, + -0.3970443606376648, + -0.303079217672348, + 0.24161772429943085, + 0.008353390730917454, + -0.0062365154735744, + 1.0824860334396362, + -0.3704061508178711, + -1.0337258577346802, + 0.04638749733567238, + 1.163011074066162, + -0.31737643480300903, + 0.013986887410283089, + 0.19223114848136905, + -0.2260770797729492, + -0.210910826921463, + -1.0191949605941772, + 0.22356095910072327, + 0.09353553503751756, + 0.18096882104873657, + 0.14867214858531952, + 0.43408671021461487, + -0.33312076330184937, + 0.8173948526382446, + 0.6428242921829224, + 0.20215003192424777, + -0.6634518504142761, + -0.4132290482521057, + 0.29815030097961426, + -1.579406976699829, + -0.0981958732008934, + -0.03941014781594277, + 0.1709178239107132, + -0.5481140613555908, + -0.5338194966316223, + -0.3528362512588501, + -0.11561278253793716, + -0.21793591976165771, + -1.1570470333099363, + 0.2157980799674988, + 0.42083489894866943, + 0.9639263153076172, + 0.09747201204299928, + 0.15671424567699432, + 0.4034591615200043, + 
0.6728067994117737, + -0.5216875672340393, + 0.09657668322324751, + -0.2416689097881317, + 0.747975766658783, + 0.1021689772605896, + 0.11652665585279463, + -1.0484966039657593, + 0.8489304780960083, + 0.7169828414916992, + -0.09012343734502792, + -1.3173753023147583, + 0.057890523225069046, + -0.006231260951608419, + -0.1018214002251625, + 0.936040461063385, + -0.0502331368625164, + -0.4284322261810303, + -0.38209280371665955, + -0.22668412327766416, + 0.0782942995429039, + -0.4881664514541626, + 0.9268959760665894, + 0.001867273123934865, + 0.42261114716529846, + 0.8283362984657288, + 0.4256294071674347, + -0.7965338826179504, + 0.4840078353881836, + -0.19861412048339844, + 0.33977967500686646, + -0.4604192078113556, + -0.3107339143753052, + -0.2839638590812683, + -1.5734281539916992, + 0.005220232997089624, + 0.09239906817674635, + -0.7828494906425476, + -0.1397123783826828, + 0.2576255202293396, + 0.21372435986995697, + -0.23169949650764465, + 0.4016408920288086, + -0.462497353553772, + -0.2186472862958908, + -0.5617868900299072, + -0.3649831712245941, + -1.1585862636566162, + -0.08222806453704834, + 0.931126832962036, + 0.4327389597892761, + -0.46451422572135925, + -0.5430706143379211, + -0.27434298396110535, + -0.9479129314422609, + 0.1845661848783493, + 0.3972720205783844, + 0.4883299469947815, + 1.04031240940094 + ] + ], + "userProvided": false + } + } +} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap new file mode 100644 index 000000000..4bd0e2c3e --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap @@ -0,0 +1,780 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e0", + "desc": "overriden vector", + "_vectors": { + "default": [ + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 
0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, 
+ 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 
0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1 + ] + } +} diff --git a/dump/tests/assets/v6-with-vectors.dump b/dump/tests/assets/v6-with-vectors.dump new file mode 100644 index 0000000000000000000000000000000000000000..9f8ed2ba11d233ed5a1fadd25a18e3d25c7e04a1 GIT binary patch literal 17539 zcmZtNQ+Fi{&?Vs5wr$%uaZYU8Hai{Lwrv|7+qOHl)xo^qteJ~hYij?6s;gZOaWo9b z|DK=gTwPa!4foP_fj+Py*gh}7OJ}}Z?}P@AB9=-lvq|7(Va!JGr;MUM{hpe~0gbSnv1S*PM0Aea8>+ ze(TFmu_AK)O`T)jCrQ1q@sDiXx$}>3|K-%bpN045>VMqf)!kdCKOR25?(auF-oV@? 
ztLyAn;ZwC=Ul-q=U)K}R+QWT!Uq7GcBkxPtipb~Z=Sz=6*YsasA4@|sGb8T@7Zf3) zeeXQq9{-+3PWdjLH@=Q{mazL|<{r=g&i(&lI6PP1_jcGvuoV8u<=es6-QCCA{af+j z?tN(}J3ajIF}iy147=}RrTMAn%GJ@AzPUxnwX*zIeY!n9qrLC6MqBTr!Q{W)^6$ja zyrni|;>`B8*R1v?P2aAWBH3lj7vI%3^;sqF>hvmb%k$=Cw$XXr+B81NjR!l|sqEcK z^R~2S14sTbjX&iv>J!y<%H`XaR-F#NuXmTWZMIzN>|6M}>R(#s^eh)kTgtZ6;~nj< zR3q*8Y~}>#mMq`Br(6+834N?fTWVcdi)Vz&K^p#Ceo&p_``A}H0*Ut~J+ku4`J!+5 zZ{;sjlbF3N7?oH1-0Dl^W>Y1rWd3kd?ls5Oj!mTK(!Vp+`kZzBn(gwu z@x1AJU$mf;*=Y^=twGOH-79igU%zf+WJo4dhFHl~ilTc^BwBWO(L)P;#mez`3@7A_VUyq-zHq9hg#K}CJ;B>vo>+8Js zjbRuyTadcfYMR8R->#_yR`q8UwuO4D&64>!Mf{pi>EFh$*1Opf`IS+JBIPL`u5T~W+vD6 zm~{PDCfIy59(|Rwg?YZ&D)QFA52$M8EyxvX(yoMz!Bz5deV?9Kl~-lFsY%#YH=mWH zhF`kXy~g(b4=&d9uf1cvwuR7M-?PpY4thmxV5n&xHttlH_2|H0yo=t}^(kK&z(rmO z4J}g_#*Uu6gpW1_jiJOWfbBl)N#elCVNKK!rW@952 z@Rj}DYE{L?)*aJ_SQ=%IBz4JO{ji|A^?m8jCBrY-f9&pX^atw}sYc^xR=}?`JGshM zOgD|9SzW@$R1hnllB}z%c_BybcACbOSZ#twh7T%Ojm%2h)*4$hHf>&CTX62b@L}Q& zGJ+=t3uOa-8%oyimZ^7Fq!=U@T|%`fH6}@2I=`Oc>LeE-INHg*z#nrvabL5lAM&^5s43jXM!0ylS7G!MysHvay z05d3E(KusrVh}VC12IUZ*?^xLON5lxDwU)r>qJ|P$Tg^@= zI`i1}ljsvj<*U~PeL1IA8rYK}<+)YGc-pjndyTPa&Xr6kJwsHfQ=Ks-< zzk$IYd9gNsRqC}*$JT6UQRjVP>D>%Mv=uO4dLrGztj$)F38HY%K*E5jE4kGJrYdRD z^Q`{9h)D`%_>&rSrh@VBtcfM>%PN`5K%dJ)`nqJekhDoMN|4Jp>8}`#`2$Z2Ff^KC zBN36i`qfRsg>2}uL^h$sJ%>Oh&cj?AH8SHx9A48&POoJ(_qLi~(`;Ug-?#Cz*-iJ> z>>p3IL~UxX%Z;)13j^;}NDQ3aUB?D#jG9W zXoDGfyM+Q;-Ml)2_L%3$!ebCBms4EgG82ZJ0D7`K^zEFig(gWVv!S6lN&^#2K21xq zZ3FyZ!CZnO5zVd=!O)I91WJLR%rm_qu_7aT6fCSV%}wD@)Y@DX#I%`oN*020VgAC+ z5H6ZFnHFg0STA8*F=A>6xax~H= zF-EUfQdOJ_%PdX-;Gox~_r!}P6P}f%6Qa&L8;@7mMR9_*&kk05cV&!_UjqER2T9UI zL8oHO9TQ*9Elm$@*xEp=C)2MRES_x7YaNPc4# zQ!tm&9~DPbZKv(h&|-k=#wcAqz_Mv1mXFI=j&$+}GK@bM^~{Hi%4`IT2VVwIK(3sg z>cLeqlQJyJ0P!9W92sQ;byHq-Uo;eo6X$6v5*^pVl}OAnVajJ|mpGLWfyo>iTa3CK zpS4liZa@l&T%IA;#$f=xhVzg%7ldsg^7(@qJ(cnHNw)epO`8(H~623t4V`S*p0??r07g<7xOkvg)V^Yg@Kapkb8Q3)5tDVF_G~7TvA! 
z;6FaRRp$A?am+g>6#{RE%uE)iC>S>vd#bW?LE7)^*(C(0C?q*`56G-DDq`i<%kf5*Jrt`??`Er9Cm^uZZ;IVWD1L-39DL_$zXPDY1 zGn+?~m$MWhb~OS8K}mx4n8h#Y9unHw2aI#WJ3$c4T_Vo zDJI;j@tRz@Cngl8#7D7FhUMO9hVe*OUV-J`*QJVWfIiq}d&Nu_==s&ebk+i~jax^a zBzMxn51#@}%T7l6Zk4M~ACalUO@F;6Z^eIaBbek9L~dWV2%G6|7*nhSCK?K3wk4Z^ z`AsL8h`KACw1#`{6bwV;Bv%kDXhw!HX5}=?2*`r~P&89JCtiCBNgbB01DXm#j5b^~ zwA%2ELDq2M{9CKoD$RJCsQJQUvArF{Qd=Y7RC`cjYZS|(7#0F7U5PU9WsjiaO5J?> zAyx2p4BI{^`+<=u7=;Orl5T~RYQNiq875N0e76Ikx%yGhx7}x6yERTK-Nt%MKT*8*{VNejntbklb+%7 zs;2o59gaQWP9rD@8fr))F)xl{3(UVOz=I;oSE2uS9JMBG*{Nn&Kmb8Nb#OwlU45IH z-PmeIlzpO{d`%5EVEV^#y}=aV<)Jbv6eKA~y%1=>EWFKDQlT%2p33a}UDp|kLx)tm zwiYakq3%jkCMT>y-ymQ};Vby1VdSy{A+6gfT#t^~Gl*T@2Z1z7TMA<%tYHomS37x#~0Ja%?W#{LIL~>L)SGyKsSb*V#p%Ubm)>(fr?6xjRdSj&L)nPB~PI)2rQ&IW>8JY zj$n4F(xHJ|S03mK_)}`*|FHK{-_{|rFlp?uWg{wTq-k^8Xo{}P{@N7)%OhcwqG6tr zS(&Q_Uh<(+bk(src{9B@#Fd0Ji(me^qD3A0RkXI-`vbxXJ{0AcT3-ZNR-szw|N8e0a1K zUs-r_Vh}xx8b(nD(N8p+ah3w9SYCpk*0y{O%6gN)tmIz}i9Y3pvUjk|;Xo?wl z(8{$DKo`k$mKrv*nz2m7rse`&Li*U>Bce{+Oob8-1fLYT{gghjct#vtg$>dZG)k7k zT&$mLbu{P;2x1oC{+&ZRn26`8Lg`{k$xZ)=!xV2qopQEiN1L`3#~Z(t7QPRvEUSkJ zoq+8bybGs?-$_7v&PrziIo?Bep;hb`kZXSw)ucOSf{K+`0%!znmA8z3kYQ_EB;}Ri z4espCQo@BKLr!qwF-%dAB?68e15~_PXBx?f%9W<|D;W1P2IUxux2|G`*zC>51Up-d zU&oka1fC=!8P+!F&x>L4#yCD5xLdc`1=_yO2=sRVZG+WHs;Be%~e=XbdP|TMf6`7HAAtSTsHatt7Rb=l> z@a3G`ivP`kQAz+PKJzYe`@Y9%`0yX6v4_ryd~NVleu`NVM>HI@60UbjR<`8MXT;Fe zc)M>7Vaj-Rbmiu#Ve3a?g<)#`WOba=7*gC8i($g@1Nfb@5Z$3&&6Giq!0D(JQ_e}7 z%k0?F^z1s+Lt$bsrd{H}6-Ck7r6i(F+^LpSSo9@-tGx?bfFgo5!=nf!V^lQ!C@8Gb zT7sT*wGh3aC8(ZKUubG{>>o)wMO^1@&%4M()^s+IRMyd<7Boaht(PQVSixD4+-)=z zcB%TphT2bPF9)}v>N>!FhRo&x!84dWp9+Er68+Y}GoP z$NVWPrIqIJe=B-a5_$QQ?pEWv-8n3 zNvjA)z~RcZD!XVWvZ0ZAiiwj95l4{lh>@?PX?p@mlkX25@Fm&n%|ya}bMuIk_Ci_6 zL$#xt2E<8q;Irn(a9SfXfXRXa#)oHdnc-oJ_IDdkgY`y6tc#c@>JS{gdjqIDUJMJ`la!ii3C6hc- z@=!k|g_|^Ho6&0%kfbOncAVy}k>5@TO-d?D4b|How7vD)*n7o6okAp_)ea(e^&H3@ z%%*mXzOf}@YttTB% z|G(Ir-2n$M^bjTyHQG!$`rjq1kx1h{EZ5F%v?OF0{3yFx6yaQmJW_O`d7uLcW=SYqVhapJ+TDEXjH*D=5%u~8Gf*sY 
zWvju?19bej0w-weziSEFArMOOL9EDS@4CB5k@vBxk>6M zX-5&hG5)|Sw@~*92{5&w_*K%cCzx3{uV<*$)*%fzqs)$u%*Wu|M@wzR=^v6VbvX3* zMJUwRB!wc39;v)rIF=O_caGhsXrz8%v zLu0>FI?ssDKKqGgj~lfZB3Wt(CK))D+9bIQvF=u}s6~LGOnNeO%?%PoiX0@Z)YdD4 zYdnTy$h3a_dlHi1mC-63=T#aMO01FaPyCtK!&&i@$>Trh4?%R=ku4a*|GdOE8nyn} z2}G@10s36k)nw`3xAbDO%9bd;Ln6a8$$11gJEU^){VBC0;m;wGKA@yA8C~d?Mg2Hj zDQTrP)E$O_+R!1yppr@S=)zO8iKGsNP?5u?)Z~fuI$B;~{cHODE(z+fsr|g;PAIIR;hZ5gH)<|WI+SFma(F)XQO>8Myb5KVB;E?)B zR}-n?@A$$p$I)iBlB24XzTFnUKXmf}pkIm_<=}YjEim>Q#%y*ezNx=}!0t$?QapV@ zcjL0m@uL*?!g5%Lq#i-c?A)wk71?dSjGnu8D3nAq}CWt zi~t+LfOhc`3V*TVu1*@Aeu@Zcj8aJkW^hVNDndr3R=Y}(iLn;=6zNQQkwVRGq!Vgw zIoE+FaKnzRbIC-dgQEaZX_)Qn*QU#}%@8?4t^kctk5nc3a^_`mva*!KWIx0`;Igh8 zXljbQF&+gXHddIoWC9WNcK_XK$P5lt)%ObNbikCH%lt&A*mmVM-9sS~{ZwunrsbsE zlx|WrAiyAz(7k}rRi2U>#UN+_s6ZW`0A>}cAdMPi!u_rhazm(3oloh7n8jWKBTe6= z$VjKlf=TbYIXiqw)Eo7SEMEgeDVPzg7^Aod4$=xcTRsj}Nnk*543(FH-Ab*;-8&3B zf(80etli1|C9rGAq2QzRb@Z_RqJGC}J%&MDSBBFL1UtyTArFoNA_~dMAUZ}$uMCm)~y{QGtfDdk{0!OQu4bE9)0V7W(f&rVp9Z(gks$`c?aeLjpb z#;~MZIiiqSL>Ffb4moL+aH0Onc#9Yg!&%w_qBlCOuULR^XTnO0K=H}w!G>oBaaXuY zv@DR?-2h5LZE}LG0W(kumrgVE<%?7uNFnx7DD*t(q041dL;JegIbwQ~ZkWX17-v~I z`AO{K0#3rBY{s0nw%iU(i*1L~et?LL0#GlJXLrIdSl(~Fw?n&G9HZ`_NJ(+%2Jj0V z@+|`ThXFokoYHk)iKStU^(gTY+^qDN00)nAB{VQEc6Yql1oq~LMpPCshBH3e20Xg^ z<$28BvkHx3E$9TsXwJkc_4wUsQ?mh3ZN!@-VMRETu6Dwjz~{LZW+k3UCQWjdX2E4^ z63~M~;~;)NLm1R*3CZI#xBe3ZwAebu2Wcoy`#+t0uHf4v#MwlkXeprz227v?A~H!s z%G)i*P*^>~5)=`9!xZL)P!`k*>rcx6lCpw#JDm6n6>!N}7jSHb{Ax_5hMf}PN`+X1 zBuaYdjkPF%3$;dg@xWgw_LYDGf>em4R(Ow~m<|9rcARF{RqOCBU>htS{w%&KeVQU8 zW!1Z}rQ(&t%de>yheb*%*(obZA>m9+x{_mtMs5JwB~D^KJJ5%eF5~sIs}T)Br56{T z!By0{42N8P;53_N52~}_kHm~wJxo@`x^p5|0aLm{v##_b8D+nie0beN&iAp{wp-$U zZ2Z1Aa06YP!XD)slS{2EieEC^)1Fr$yZU4sNBM6Q4?$-~3s9PfLczm-O=8`%8Y2+BIA^KT z4iT-~1^%Ka|9lC=e~*&+C~L|R|PugRz0}B z?3puP36)|6J)MB`S{jc%jEG<~+e};(74x%+W{b^Z_<{T5>Ix^u z+lQnm7piLpN_l@Hn5*srNTohal{gD?nS#oj(hxV-$c8WSqJjYvYV`S17bVk^gC4D= z$DRNO;D)r>YY#XpTpTheVguzTkD-DA3(JhC7OzEbuboMAFdOES164M44S@0yHtN1` 
z1FNGMZY}51l59{x!_QHIFHV!$cDa2Ba~*+0c>#%Vbf zH+%RHi2x3hPNL-COPGl73t<`ZL5KR5PttdbpcoUu<#}enfSn^D-WyOQ2kb15V_+z|+p}+?Dh+^|n}FG7VMzU5IFiQz?fc{A?rkN-5Fj-r{DdneDTb!Xy zJ)R->kQzb?4Y-**q;vj$QlOBcKwcHWO1B6T8SG$FL4im%eDa7i!8S8H2g&#xJOxQ3 z%{m}<9iGH+dqTtj&_agR^i!2ma5gXP^}6M_Az+S79~b;_0FYV;PFjd}2C^9UFK7%3 zU9GkL;7{j^h)|3xM54WiILYA_Lc#`@&~-ChgiP_lR)Qf!G5Dr2mc-=YEb5t5!&Ffj z+lDfU$o;V?XY5>og?en;7s72p_$@cJMf7a%lnpD)8t4?KWs%Xn=q;0Hr9lC2@kpOq z=h%p=zoUTY%H3>PU3f?$m+2(TwG-=zVL~V(Mna>kUr(?}xI7Rhbli`-^O!^NTFBj=)3PXl2jQ`7~2STZWX|EZEz1BBf@lnM`v(uMO`36feL|Y$%e@~K8H-Xxhd>d-pCHf zw|t{|Qjmm)fHicP(?p!G0$gC=RtBpYieH1al63a3eLUAI=I`ziIr#axe*M=RAAFui zuUV$IhcoYaZBJd@T2ytD2X;jx^=0=j?RY?oAdg4secUcER=4CS-i; zyF>(qka4S>QT}zqRQaoYz>$Kjqwegs%eV-t4R24Or;43;v( zq}hl@f^ui#s@YNs6=F#~pJp2?K7X>_U&C6g1cOols*I?hEZ*2~*1xb%nI2&4j*G9y zp9wiBX6(&BlfHkVv$=v}(?&;y2jyxAL{%2LhK5T!ZWJ{pHz=S0^><5!!&{gn`SPw9 zTG;Cd1Wbmfbs&pFHY*a5kz@0TGFcHpXuEPyqvFZE@@Gpo4*nzBp}?3>Cp188YKq>_ zgE-!DCD1HXG1$rptW6OYYEd2@tfudea!D}J!W&RxGdfu}+>mw1s?kbwRmUZa0hK^d z8}OJ!+zGJ%&=^BET!mGj`EzkAj6A>8SzIC}RzZqwfRU zXf+X+TG`}O*GW?@pcA`|HDG;@!`$V4_W$7Uf3#*V1dTwq0S5gFJBgx{wRlK%Q9eD; zcJ?n5ni0IXU0_&%rt#EAV%n5Ny;x0jkkoGlW}+5N(jEA}&6^Zn5lQ<`q8cJX)T}wp zrrH<3GKy7m+oO1qIBuY5S17D6gkl(4ZV{g)ennP+{3+grUIRr#vw*=x625AJ#K5u% zsK&%~Qh-x`B)UHQBy$8ymF+4$aEk(#C=R9t1#9yYen%M$B)*~ zwU(qtD*b0m|nckJ@=uCp6Tp{v@q1!I19$`6EN3T)Hf!9nx) zS)4E@B7RYsW8z^usJ4#Gf&T1D8tgB)08m$LeoG7NLRty=-(FnfA@P($WH=;cV}rr%!Lf|3o->JMU`iEd5Kw|e zvL3HG67UQZm5VbXOw}9fXWY5aci2m!nKW6~8-Z9?CTkA(pd17m@W?7wFc9KyBen(8qxuD_wS&+s$cXI) z1Iw)GDV?Os)e@)fmC-O62gCy)o`(&KLOCGUbt{12Km6u!BfnTYC!)!=O^pE_{B43= zqJii584g9r%*5389FA<&jiI#i815UJB&?YY_Pdm(w$Twp*NtC!_&2_%>w0W0ASei0Z(6j(Tv>U) zNUKsNA;xdUZ<8~22E`bAT(FKqB45C~8MLJigD(S33xbyU}u3rfgT=6xU@ht3on=fe+P1tHm7kuycl z#Tl!4q(q?LA;*+ti7!blv`#uLgnVMK<`@u;@U^x#w1=P5<{P=Dw_a?M74z|S+-MB+ zuHQzeQhl0KpuxfY3<(ju=+_;;{9Ut?pG{qt02U65t*&#kgHj{=aPh^PHCy`7qZY`mq#8Nsy4AHT zhaPlbu%sN6+juQniG&}_t5Uc5ShvnGUL~~~8{I-N1clVddL@J{_K~OzMUzA67H+np 
z{E|E9z}UVNHpH+{Ja19pa*r3>Fkkj3Hm@>CpWEk>#j>@z{o{B_70VrfWe4^^gy;j4DCX^2F#jZOW2#1n~NuRd+9+hO9jju$!1VIP74yhaH~OmVUxq&$?!K4EeD_EgBKf{!k!VN=xZPG zTkQ7(mj@RItlR>3IcaW}5SE-~!R3m5ctBHrkHJ#;&i{;fw2>3%zrfeHQM)>&dURpe zQ>o?-MQPaZBvet6L$&pUvG@DC#0DaTGlIPy?VjZjUbIOl)b`TKy`_=!loS9Dg^K0O zQ_Cr>i&=qq9nZh*_Ml{4b!}QAy+xsqJz=I249HEw9vtxy<>UOOa%CVC7@=^y-un?5 zd?2gc7}UfvLL5M$d4A~uzn|d~+`^=J%18Ub5p-`&!!TKB4phRoR#sV(b1nLAl(*a{?8}UQ5pV

?5&|K+bKfN?Il8*@tFQ1&$%q=BW^gEU?60)HcZZbC2arZg6l{D?Pv!@$Lbgo?68?6u4i7Yc6f{vRrj?}!1m8S(<%FdLeEdOr+o_5S~q3M%&Pj~bZ0{9Z%(GLMOmk%m0 zDV}HXhuAH0p{j+0`YsV5%|ZpHIYRBPmNX%sz>Yp4V)GmU>=L;IHqhfBt~)rpWqe0^ zs}hl?20jwrOCo+j14Y}D#SbRJ)(&5O79l2RPx5$`3fw4Ce{0dXi$oj@+}Wv5(@&@1 z&|dQ1>O9ukxkDu|8AT+69W_IO%LEh)^%&xxxTX`gp}`aCP)|wXIPY_nX12<0NHmIR zs~H<}B&q7v=;7io;x0~5V1qEt$#~K4l;9v(sQ?K4olM1mgcfZU)&xjBP<&KSnAn(K z9tI!Np*1WF32xYcUex1pRR;u8YkSy|G}N}oFt?gCB&tEgO+L62zdE3>LvP@VWUQ%j zUecJ`f+O532Ndhgs2y-UkWpc$I>5K&!_nePFRNc^NL)GH!P6G2v0zc{6!*(c#dw`U z2_)0E{~_h3$S{m=ljktm&2$NlYr8?pmL=juL6w};jDR{fq)AFtM837Ai>Q$2Rb0`^ zOz|$em6AwFZ8atp@fWo#7>>{p!wQz<4{``+ns&kPbO@y4z`Sq$@IV6?FL6udF=Hk6 z>Wpfqm4kbOXgoZxiZPY^v}6@gXsap(#V`-2!ssWoq=Fs9)M&xE)6m+&8_EP%=&pha z0E#FXxN{WAUS_ep4VEJZ309uyQviXdE#)D_+rWjYS8k`J$PzM_dXhV6B*>IFI}lxN4F*9*Z^C>9XJ@io!Ojqs z6j~;)dE7|&%*}&HT3tcSf17_AUGR2L?sxF|*p9o@E7r@Ti-Mr-2up2_s+dN*8)j=^ zMdsFCx&!pcpbzcFi78Wb#zNgrj57R{;VfbcOPemK4`V8GR??YB>yC)6cm~&?MgBXp zk_~_Zn>`(gSj3eGj?-rzOT;1?Pt>=Dj`0`|E;shNZ4r+pt$5G}t!C zax|m!B=%UrYJ)lfPuV>VGh%ZmO2Z(uQO{M> za4QZ5tycl@t40JWs2Y_+IfW`#IQi;b(mTW9y3KS9lXS{612RPIh4!CH(A z2ssKyG^}o@k3qK(0=8}eI4w#%qY$*7yEk#}0A(@2Qv!>`HHcJtL3Yh6!fIia@jvE9+rk;ll9U#9KI z+S+(p+Q9%y$B-onBWfe`;JFD`bbfEH%FC+0;5&B>o?d0@WfJa!mjy*G31;y@iVD1I zJSZJKWGrQ-1A(B2w3*GbCuA^SF4IcXI(}R6a8|R?zZr`_KCF)!eVt&6390iQ<5F2v#qyAL` zGz}H8u0=-V_uCmFWDd*|Wr>Tt1TBH@%hWHW7Z4<|Ml55*Iw{7~6!#(=Sv8}BcL!Dx zetK5~gWA@R5PmKFwb^VWgjPzAqeZrlcyW4In~8bb)YQer;OLr^FzkX00eADTK4C zpWS}tK%B%WpqCfoN+N)gQ$F=br_}vb{k^8%W`%2!1rJ4$J+JtOUyF;dA|M=$<;kwl z;|@7%qSVZPfb@1EhXWWYm;WaUtyxC3hyUG-0LuoL*O&!f%gYgRJ-`{!wg{3;H*|C0 zh-cd(;)4Alz5?XKvb4c7DyD}dF_AFt$6i+c)C=R(iEYR{DYovlQyUYUt#08Y#eaXr z1BbZN{|o9Hx*97Oo``aJdk*WApme@Zqns8d+tQgh9$mypZCcctL>G+hm~k&IhDd9u zTNr;nk+;pGsi_=}aSV-iR#bEYBhSLhK-h3U5Q4@vOoRAAAf{p6NG@vGRMUJhFFDQ1 zYQGb!HauFq=_fjXC6@Ii&o4+j0CVg~&tD9tVHNBiPSJ~$2l*EyS&FFMnuFx7qb>G* z-bHO$@UMnz<7Tu@@TEc;PnZ z<~Y=!U#Mm%bpd3o210ML;+wHe6*mYtR19c|7ZKlTCxgsC%(k}a7XxOx9BndpdQLP? 
zi{gQQ(9Pdb36qgZ*3mrS!0j-&;fF6!$=)15GYIdLIR=D9B_OGdfd`WJ35$~EyhNFc zusRuxvJ>Qo%5oSEd8FL`^dKzH?g>O98Pj&&ChlMocL%iwwa#zg!kKqeep?qMbKoyV zW=VUVpUZMEds=&|F&l%;Wx-gmBr}302F4x!4y|lbqryd@x^0mpmq%s-b<*|5bjVo5 zFAU%MF+%j9#wB9aOGBK@Yey()5A6(^Kpa3R8(M6e8Hy+2TRd;$)5us^^?s8JwI@(% z%-(<*P{eYa$qZ2*qv2I*zwow-gMYPd<8{0yr-JsGB%H^x$4_KEIhR2OFLHTyB~zKk z&S!<=cu8=f$2oZdm(xqLoMlL96lKAK^6^a0gzjR!CWA5O?WM9%srpXbD@v9ZVWt*l!+H9*{cOU^c5jn2J+?W5A-w%M! z9bMwmyJ*Dr?fTP}O^5NfTOeqN(xKt5&8^YL1!PilsnI@Y)+^5yfVR>uaYb;0L{(-Z zRW^DPh+XOpx8E3$`7zzW&Ni*8YNLfQ znUb>Ihe0_gUF-jX_kge1ZdsiBReK??z zXFuLiggN!L0T+@Torw$id;vA=O z9>CGhuTN;PsCRE>TV{c!voB*HRs)^fe6B9TVh7IeIE21|%Sf7Ou=TeEO3Ne(dYGgO z=LTv8skQ|=JQ|7_yJ(GPQPrBKl8Aap{6jrP^*0*!4GcK1nz?MyV9m{3#>R_k{zPHk|tC-VrNDp#z20zZKz32J43 z31iH*V47NEj6edMoI7H;2?g@K)!60p-*F-z9@Adhyku~W0oC9$Q7uIvt9)m-c8H8* zi{GGL@SX;-3k2VHRrw#s1+a7hbl<}fTw20A_Ia9~%9BJd)Qp8_0No5o0As&%@W7o| zV0O_)mDmQQ?bm7geV{C&`C3luD%^>w`aZ;odf zw;E9sICQO)&{8bO-ETJ0eZQO*IXdI;?%Hy#d7C-YDXVDR4FqeB^>>HxzC{Eu2AKHb z&g|3WxnD+je1c|51e!3nQIM_fB)XPW6h{fmG_^P6RxvBVZ(lV;qSi++Rq$uHf58WP zvKqiL(^-VA%v)R^l!)X4b>$F$Fcd2wriy$x-2z7~%4CBqB^7DnAXTT1)<4=thqgl? 
zN+jEGYq-}fS03;@;_vp-X3Y~63XN1Kdg@G$;iU%NrAe*?X;Mn?ATXQLKK}H;Tp=E3H$6OZBuTi?5aj9|X zN!o9T);U`v%-#5Dh{lxGFDZATcCHjd0p!`$A+fc&S8b=A!6G6p|C%pxR%KJs$c+3I zreZ9LB(oNb>sFuw$&?>zpg(O~^ak95QO4Q2^pE^~?3A>z5C^R{A zlY zJ8Ot6_qI0V+xntGG~&;A^YCwJ-N%d{@@A){rujokC(y{aezGuu?<+I*ZY~#7EDVAy zh=oZciR?M+Rq5+}H-Mp2mEJ`B#S>ZD)!6^&tND&F%Q6$olBAK%(gCW)vrW~_r2ztv0} zi}_djZ>8bBx;K#&KBX$n0mGGkE z-R@KSnrGOGeOSm&i`-dTp2Xr#8JrI66u%veXbe6*!d@hJCGCP|tbGB#w|g}$Xb2%_ zwB0XJ)ZG36DkY=1czg1AXy<}c=uuvs(a+s5_(DT7`DRNEyjJosXK^XcCoZBKsZ*oq zQ~to%f##|)X!Zc@+q~z~$^`FDZ6casrCvnL7SR23Z~zs4>KLaZMK&%3PJUdE9SDoX z+*N^LI^nfRrSMI5kf48jLpvPA{KNU0)R zT!Lo5svY{?QCW4DMwcCF?X`tyasnLN2!fV%uLzhB5ewWEK&{0EYolg-QXR5F$Ubik zG&oXz6W17VSGMCZb=*~uEiZ**1D%t~x%3ta_Q(G>#-Eze_g4j;!C82jh7Ei)m*SD2 z^LC|G!8wwIJ+XR}=eM~I!j456r-h5RF&q?2+Utg>K!G2H6*P^!!lg6hgT9B@a1#=l zcMAHGKATb)F^j-x@>OU1{^lpB@8Gk2{@rtVcs4$v$FqL!AoABh-=cg|c)alcFaF=c zGK{OsB;SQGhZBD<|M~wPVf#hH))2o-a5FnQcQc#!vvvKvl6-qE0UcK`L9JE|dHk*z zSan}ctizGVG7M!9ob*U7@FZJtnEdqrvfWDqqVLbd9wGlRb>r;LC2r?v zE8Gd+N0P_Z{FBx+pMGAPJjVcS$dx}V3B6@MV6{ZgKj1wO9s1jTF5WM=Ha?!N_YdDU z{XD(p{$FZHZ!b7L=)^u@W$?@0Y=i$oE$a8AVHs2Y$?`Jc2f4t${mTqN#b$58Q|Ht9Q z_rpW()W7$cgO!omSrYcw2lhIXogwCW=2jBTdZauDuMxEIw%wg=Vuk$P&W^)zL%q+v zp}lh#x0Zw1%W80c|Bs?ej32*0yL*56cm{;^{^FYp_(_Dj#QukT0KN{);!HQsPDZ~3Ay^Y{@QuIT=VzG7ri?@?TCB%x!@D%PbS_UsR`cr*FNf> zF;&0rAEqyKt|-)h$-MpT@BSmqXQL4UlCIbrzUK4y=d<-KKio{<;|KYUFYP=j=HhwB z5t;iVTyOpTIs56!9+lbuy8T_&ynB3^SoMi`@6z_;2b=`tcID@t^PeK~PVu*Yz#8k1 z@XZf#JV?n$Pxi#!4h)Et0@0V+!*6kr@1>sq_EZ=I?5_7yxcuGyEB{&fe~>!skN@<~ zPsisEGdc6OtT&M&a1Qy$R;S@%+#K6)XhP`}}7ofJUbaQaVrV^?6IL zpE2+DSv%j(p1tgc_N?#q@baA2{=-W$Jtt$%2jeSA?-nfakZUU5xKLpF@!8!m8(Wx^ z-}x%kRAiK%`(erX9Z>51hXQzi<8n|IJiGer?zhUYv>0c1Cl|LTSJ!tZXUAo_Rv7KP z>G{dup5618|9bM%Yac_b6sNhq(g!}de|dTF>N}S6C+Er)zqXold-<}}SNYlNxt%}b zVXn_!&eyM=z07HQ)ehUM{p{73SM#sGw}oum_6M$`k9@2H{CHk>XSct*ee%P;{)GB3 zP56)X-w*Ke{(lsj{jm_e!w(<~e@gv-j`Xc)vLpS9$#bKhPaPhk)OQ*0pguiJN)=}6b~c5 z5W?`=YtAa1=u+tOPmctXcW7L`{4bgR_jRQNpm4cqjyt544^FV2rc6wayM 
z0M^~UAl1~Hhw~RF|2oI^uct~}@2~imH0NKR!{oR==M)!libdf1`0Xhkmhj<_n}a0a zmip`E#ntbSlDoT4ZeM%^QSkQpo5RKBMWn#bUtPcX$d`Nj-@g6)?~$qbqWh7A<2@3! zj$=ac`y@(-Hs)K*)ZdY)_9tlhUxcCEUQ|y^?^}P*9;_pI*3kR*K!=v++AWOt!xr?% zE%@fu+F#_sQ*~1;?00OzIE8#G?(dZ-UcEYee18$&o5RVQi`!f6$f?4~^OMi6?@wOdUtIcciZFE$U0WwF58A!M$=%Io z-n{D*-S4?}tZWe^dy$KJ^|Wkzb_!`Q$-T^F2ml9XU&Xuc3}Z2aoc5 z$4KnoVI;oMNI%+%M?3N3uoK^|V2j>CL-u_`M*AozO}?06{8y?(40KE57bkFUqq Q|Bu)I10^F{h5+IK0As3ms{jB1 literal 0 HcmV?d00001 From afcd7b9f0c02178bf6f71fc26a078538666afa3a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 May 2024 18:12:26 +0200 Subject: [PATCH 36/56] Test hybrid search with hf embedder --- meilisearch/tests/search/hybrid.rs | 121 ++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 9 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 67f7909b9..028b341cb 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -5,7 +5,10 @@ use crate::common::index::Index; use crate::common::{Server, Value}; use crate::json; -async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> { +async fn index_with_documents_user_provided<'a>( + server: &'a Server, + documents: &Value, +) -> Index<'a> { let index = server.index("test"); let (response, code) = server.set_features(json!({"vectorStore": true})).await; @@ -34,7 +37,39 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde index } -static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { +async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> Index<'a> { + let index = server.index("test"); + + let (response, code) = server.set_features(json!({"vectorStore": true})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false, + "exportPuffinReports": false + } + "###); + + let 
(response, code) = index + .update_settings(json!({ "embedders": {"default": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.title}}, {{doc.desc}}" + }}} )) + .await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + index +} + +static SIMPLE_SEARCH_DOCUMENTS_VEC: Lazy = Lazy::new(|| { json!([ { "title": "Shazam!", @@ -56,7 +91,7 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { }]) }); -static SINGLE_DOCUMENT: Lazy = Lazy::new(|| { +static SINGLE_DOCUMENT_VEC: Lazy = Lazy::new(|| { json!([{ "title": "Shazam!", "desc": "a Captain Marvel ersatz", @@ -65,10 +100,29 @@ static SINGLE_DOCUMENT: Lazy = Lazy::new(|| { }]) }); +static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + }, + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + }]) +}); + #[actix_rt::test] async fn simple_search() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let (response, code) = index .search_post( @@ -98,10 +152,59 @@ async fn simple_search() { snapshot!(response["semanticHitCount"], @"3"); } +#[actix_rt::test] +async fn simple_search_hf() { + let server = Server::new().await; + let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + let (response, code) = + index.search_post(json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}})).await; + snapshot!(code, @"200 OK"); 
+ snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###); + snapshot!(response["semanticHitCount"], @"0"); + + let (response, code) = index + .search_post( + // disable ranking score as the vectors between architectures are not equal + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.55}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###); + snapshot!(response["semanticHitCount"], @"1"); + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.8}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###); + snapshot!(response["semanticHitCount"], @"3"); + + let (response, code) = index + .search_post( + json!({"q": "Movie World", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}]"###); + snapshot!(response["semanticHitCount"], @"3"); + + let (response, code) = index + .search_post( + json!({"q": "Wonder replacement", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": 
false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###); + snapshot!(response["semanticHitCount"], @"3"); +} + #[actix_rt::test] async fn distribution_shift() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); let (response, code) = index.search_post(search.clone()).await; @@ -133,7 +236,7 @@ async fn distribution_shift() { #[actix_rt::test] async fn highlighter() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], @@ -184,7 +287,7 @@ async fn highlighter() { #[actix_rt::test] async fn invalid_semantic_ratio() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let (response, code) = index .search_post( @@ -256,7 +359,7 @@ async fn invalid_semantic_ratio() { #[actix_rt::test] async fn single_document() { let server = Server::new().await; - let index = index_with_documents(&server, &SINGLE_DOCUMENT).await; + let index = index_with_documents_user_provided(&server, &SINGLE_DOCUMENT_VEC).await; let (response, code) = index .search_post( @@ -272,7 +375,7 @@ async fn single_document() { #[actix_rt::test] async fn query_combination() { let server = 
Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; // search without query and vector, but with hybrid => still placeholder let (response, code) = index From b17cb56dee5a21574d3a35b2e92a17f04a44db47 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 May 2024 18:13:27 +0200 Subject: [PATCH 37/56] Test array of vectors --- milli/src/vector/parsed_vectors.rs | 59 ++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index bf4b9ea83..4e06177de 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -147,3 +147,62 @@ impl VectorOrArrayOfVectors { Self { inner: Some(either::Either::Right(array_of_vec)) } } } + +#[cfg(test)] +mod test { + use super::VectorOrArrayOfVectors; + + #[test] + fn array_of_vectors() { + let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap(); + let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap(); + let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap(); + let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap(); + let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap(); + let two_vecs: VectorOrArrayOfVectors = + serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); + + insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null"); + // 👇 is the the intended behavior? would rather expect [] here, but changing that is a breaking change... 
+ insta::assert_json_snapshot!(empty.into_array_of_vectors(), @r###" + [ + [] + ] + "###); + insta::assert_json_snapshot!(one.into_array_of_vectors(), @r###" + [ + [ + 0.1 + ] + ] + "###); + insta::assert_json_snapshot!(two.into_array_of_vectors(), @r###" + [ + [ + 0.1, + 0.2 + ] + ] + "###); + insta::assert_json_snapshot!(one_vec.into_array_of_vectors(), @r###" + [ + [ + 0.1, + 0.2 + ] + ] + "###); + insta::assert_json_snapshot!(two_vecs.into_array_of_vectors(), @r###" + [ + [ + 0.1, + 0.2 + ], + [ + 0.3, + 0.4 + ] + ] + "###); + } +} From 9969f7a638102473fa6f404c1bffcec9f1d866a7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 20 May 2024 10:23:12 +0200 Subject: [PATCH 38/56] Add test on index-scheduler --- index-scheduler/src/batch.rs | 3 - index-scheduler/src/lib.rs | 230 ++++++++++++++++++ ...x_scheduler__tests__import_vectors-12.snap | 19 ++ ...ex_scheduler__tests__import_vectors-2.snap | 20 ++ ...ex_scheduler__tests__import_vectors-4.snap | 23 ++ ...ex_scheduler__tests__import_vectors-6.snap | 11 + ...ex_scheduler__tests__import_vectors-9.snap | 19 ++ ...ndex_scheduler__tests__import_vectors.snap | 20 ++ .../Intel to kefir succeeds.snap | 49 ++++ .../lib.rs/import_vectors/Intel to kefir.snap | 48 ++++ .../import_vectors/adding Intel succeeds.snap | 45 ++++ .../import_vectors/after adding Intel.snap | 44 ++++ ...ter_registering_settings_task_vectors.snap | 36 +++ .../settings_update_processed_vectors.snap | 40 +++ 14 files changed, 604 insertions(+), 3 deletions(-) create mode 100644 index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap create mode 100644 index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap create mode 100644 index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap create mode 100644 index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap create mode 100644 index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap create mode 100644 
index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 40398dc37..1f5ec76b9 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -952,9 +952,6 @@ impl IndexScheduler { .into()); }; - /// some tests to consider: - /// - /// - dump, then import, then change a document with autogenerated vectors for (embedder_name, embeddings) in embeddings { // don't change the entry if it already exists, because it was user-provided vectors.entry(embedder_name).or_insert_with(|| { diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index dd2b296f6..f743422a7 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1774,6 +1774,7 @@ mod tests { use big_s::S; use crossbeam::channel::RecvTimeoutError; use file_store::File; + use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; use meilisearch_auth::AuthFilter; use meilisearch_types::document_formats::DocumentFormatError; @@ -4982,4 +4983,233 @@ mod tests { ---------------------------------------------------------------------- "###); } + + #[test] + fn import_vectors() { + use meilisearch_types::settings::{Settings, Unchecked}; + use milli::update::Setting; + + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let mut new_settings: 
Box> = Box::default(); + let mut embedders = BTreeMap::default(); + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), + api_key: Setting::Set(S("My super secret")), + url: Setting::Set(S("http://localhost:7777")), + dimensions: Setting::Set(384), + ..Default::default() + }; + embedders.insert(S("A_fakerest"), Setting::Set(embedding_settings)); + + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}} the {{doc.breed}} best doggo")), + ..Default::default() + }; + embedders.insert(S("B_small_hf"), Setting::Set(embedding_settings)); + + new_settings.embedders = Setting::Set(embedders); + + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings, + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + let (fakerest_name, simple_hf_name, 
beagle_embed, lab_embed, patou_embed) = { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, fakerest_config) = configs.get(0).unwrap(); + insta::assert_json_snapshot!(name, @r###""A_fakerest""###); + insta::assert_json_snapshot!(fakerest_config.embedder_options); + let fakerest_name = name.clone(); + + let (name, simple_hf_config) = configs.get(1).unwrap(); + insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + insta::assert_json_snapshot!(simple_hf_config.embedder_options); + let simple_hf_name = name.clone(); + + let configs = index_scheduler.embedders(configs).unwrap(); + let (hf_embedder, _) = configs.get(&simple_hf_name).unwrap(); + let beagle_embed = hf_embedder.embed_one(S("Intel the beagle best doggo")).unwrap(); + let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo")).unwrap(); + let patou_embed = hf_embedder.embed_one(S("kefir the patou best doggo")).unwrap(); + (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) + }; + + // add one doc, specifying vectors + + let doc = serde_json::json!( + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "userProvided": true, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "userProvided": false, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); + let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: 
S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); + + handle.advance_one_successful_batch(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "adding Intel succeeds"); + + // check embeddings + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let embeddings = index.embeddings(&rtxn, 0).unwrap(); + + assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + + let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let doc = obkv_to_json( + &[ + fields_ids_map.id("doggo").unwrap(), + fields_ids_map.id("breed").unwrap(), + fields_ids_map.id("_vectors").unwrap(), + ], + &fields_ids_map, + doc, + ) + .unwrap(); + assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); + } + + // update the doc, specifying vectors + + let doc = serde_json::json!( + { + "id": 0, + "doggo": "kefir", + "breed": "patou", + } + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1u128).unwrap(); + let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); + + 
handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); + + { + // check embeddings + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let embeddings = index.embeddings(&rtxn, 0).unwrap(); + + // automatically changed to patou + assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); + // remained beagle because set to userProvided + assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + + let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let doc = obkv_to_json( + &[ + fields_ids_map.id("doggo").unwrap(), + fields_ids_map.id("breed").unwrap(), + fields_ids_map.id("_vectors").unwrap(), + ], + &fields_ids_map, + doc, + ) + .unwrap(); + assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); + } + } + } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap new file mode 100644 index 000000000..718ea229c --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap @@ -0,0 +1,19 @@ +--- +source: index-scheduler/src/lib.rs +expression: doc +--- +{ + "doggo": "kefir", + "breed": "patou", + "_vectors": { + "A_fakerest": { + "embeddings": "[vector]", + "userProvided": true + }, + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap new file mode 100644 index 000000000..bc16fc8be --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap @@ -0,0 +1,20 @@ +--- +source: index-scheduler/src/lib.rs +expression: task.details +--- +{ + "embedders": { + "A_fakerest": { 
+ "source": "rest", + "apiKey": "MyXXXX...", + "dimensions": 384, + "url": "http://localhost:7777" + }, + "B_small_hf": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}} the {{doc.breed}} best doggo" + } + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap new file mode 100644 index 000000000..013115a58 --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap @@ -0,0 +1,23 @@ +--- +source: index-scheduler/src/lib.rs +expression: fakerest_config.embedder_options +--- +{ + "Rest": { + "api_key": "My super secret", + "distribution": null, + "dimensions": 384, + "url": "http://localhost:7777", + "query": null, + "input_field": [ + "input" + ], + "path_to_embeddings": [ + "data" + ], + "embedding_object": [ + "embedding" + ], + "input_type": "text" + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap new file mode 100644 index 000000000..712a62c77 --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap @@ -0,0 +1,11 @@ +--- +source: index-scheduler/src/lib.rs +expression: simple_hf_config.embedder_options +--- +{ + "HuggingFace": { + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "distribution": null + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap new file mode 100644 index 000000000..002a42e59 --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap @@ -0,0 +1,19 @@ +--- +source: 
index-scheduler/src/lib.rs +expression: doc +--- +{ + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + "A_fakerest": { + "embeddings": "[vector]", + "userProvided": true + }, + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap new file mode 100644 index 000000000..bc16fc8be --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap @@ -0,0 +1,20 @@ +--- +source: index-scheduler/src/lib.rs +expression: task.details +--- +{ + "embedders": { + "A_fakerest": { + "source": "rest", + "apiKey": "MyXXXX...", + "dimensions": 384, + "url": "http://localhost:7777" + }, + "B_small_hf": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}} the {{doc.breed}} best doggo" + } + } +} diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap new file mode 100644 index 000000000..6b285ba56 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -0,0 +1,49 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: 
NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, 
input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +2 {uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,2,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,2,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,2,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### File Store: + 
+---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap new file mode 100644 index 000000000..6f23d96fd --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -0,0 +1,48 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: 
WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [2,] +succeeded [0,1,] 
+---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,2,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,2,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000001 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap new file mode 100644 index 000000000..5dcb5a4f7 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ -0,0 +1,45 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: 
NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: 
Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap 
b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap new file mode 100644 index 000000000..80521df42 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap @@ -0,0 +1,44 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, 
stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [1,] +succeeded [0,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 0, field_distribution: {} } + 
+---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000000 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap new file mode 100644 index 000000000..97b669f44 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -0,0 +1,36 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: 
NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: 
+enqueued [0,] +---------------------------------------------------------------------- +### Kind: +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap new file mode 100644 index 000000000..f3ce4b104 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap @@ -0,0 +1,40 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: 
NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, 
distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,] +---------------------------------------------------------------------- +### Kind: +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 0, field_distribution: {} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + From 7e251b43d41b4f44ff597a02ea1d0f759c42989e Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 20 May 2024 15:09:45 +0200 Subject: [PATCH 39/56] Revert "Stream documents" --- Cargo.lock | 38 +++--- index-scheduler/src/batch.rs | 10 +- index-scheduler/src/lib.rs | 14 +-- meilisearch-auth/src/store.rs | 2 +- meilisearch-types/src/error.rs | 1 + meilisearch/Cargo.toml | 1 - meilisearch/src/routes/indexes/documents.rs | 116 +++++------------- meilisearch/src/routes/mod.rs | 28 ++--- meilitool/src/main.rs | 8 +- milli/Cargo.toml | 4 +- milli/fuzz/.gitignore | 3 - milli/src/error.rs | 3 + milli/src/index.rs | 7 +- milli/src/update/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 2 +- .../src/update/index_documents/typed_chunk.rs | 3 +- 16 files changed, 91 insertions(+), 151 deletions(-) delete mode 
100644 milli/fuzz/.gitignore diff --git a/Cargo.lock b/Cargo.lock index d9e96b029..937fce64a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,9 +378,9 @@ dependencies = [ [[package]] name = "arroy" -version = "0.3.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9" +checksum = "efddeb1e7c32a551cc07ef4c3e181e3cd5478fdaf4f0bd799983171c1f6efe57" dependencies = [ "bytemuck", "byteorder", @@ -1536,9 +1536,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "doxygen-rs" -version = "0.4.2" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9" +checksum = "bff670ea0c9bbb8414e7efa6e23ebde2b8f520a7eef78273a3918cf1903e7505" dependencies = [ "phf", ] @@ -2262,11 +2262,12 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.1" +version = "0.20.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd" +checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934" dependencies = [ "bitflags 2.5.0", + "bytemuck", "byteorder", "heed-traits", "heed-types", @@ -2280,15 +2281,15 @@ dependencies = [ [[package]] name = "heed-traits" -version = "0.20.0" +version = "0.20.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" +checksum = "5ab0b7d9cde969ad36dde692e487dc89d97f7168bf6a7bd3b894ad4bf7278298" [[package]] name = "heed-types" -version = "0.20.0" +version = "0.20.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb0d6ba3700c9a57e83c013693e3eddb68a6d9b6781cacafc62a0d992e8ddb3" +checksum = 
"f0cb3567a7363f28b597bf6e9897b9466397951dd0e52df2c8196dd8a71af44a" dependencies = [ "bincode", "byteorder", @@ -3188,13 +3189,14 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" -version = "0.2.0" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a" +checksum = "629c123f5321b48fa4f8f4d3b868165b748d9ba79c7103fb58e3a94f736bcedd" dependencies = [ "cc", "doxygen-rs", "libc", + "pkg-config", ] [[package]] @@ -3346,7 +3348,6 @@ dependencies = [ "rayon", "regex", "reqwest", - "roaring", "rustls 0.21.12", "rustls-pemfile", "segment", @@ -4415,6 +4416,12 @@ dependencies = [ "winreg", ] +[[package]] +name = "retain_mut" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" + [[package]] name = "ring" version = "0.17.8" @@ -4432,12 +4439,13 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.3" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1c77081a55300e016cb86f2864415b7518741879db925b8d488a0ee0d2da6bf" +checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" dependencies = [ "bytemuck", "byteorder", + "retain_mut", "serde", ] diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 582497c15..bc9823a01 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -785,12 +785,10 @@ impl IndexScheduler { let dst = temp_snapshot_dir.path().join("auth"); fs::create_dir_all(&dst)?; // TODO We can't use the open_auth_store_env function here but we should - let auth = unsafe { - milli::heed::EnvOpenOptions::new() - .map_size(1024 * 1024 * 1024) // 1 GiB - .max_dbs(2) - .open(&self.auth_path) - }?; + let auth = milli::heed::EnvOpenOptions::new() + .map_size(1024 * 1024 * 
1024) // 1 GiB + .max_dbs(2) + .open(&self.auth_path)?; auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; // 5. Copy and tarball the flat snapshot diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index dd2b296f6..5704f5354 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -453,12 +453,10 @@ impl IndexScheduler { ) }; - let env = unsafe { - heed::EnvOpenOptions::new() - .max_dbs(11) - .map_size(budget.task_db_size) - .open(options.tasks_path) - }?; + let env = heed::EnvOpenOptions::new() + .max_dbs(11) + .map_size(budget.task_db_size) + .open(options.tasks_path)?; let features = features::FeatureData::new(&env, options.instance_features)?; @@ -587,9 +585,9 @@ impl IndexScheduler { } fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { - if let Ok(env) = unsafe { + if let Ok(env) = heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) - } { + { env.prepare_for_closing().wait(); true } else { diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs index ef992e836..1eebd3fe9 100644 --- a/meilisearch-auth/src/store.rs +++ b/meilisearch-auth/src/store.rs @@ -49,7 +49,7 @@ pub fn open_auth_store_env(path: &Path) -> milli::heed::Result let mut options = EnvOpenOptions::new(); options.map_size(AUTH_STORE_SIZE); // 1GB options.max_dbs(2); - unsafe { options.open(path) } + options.open(path) } impl HeedAuthStore { diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 158dfae92..eea012331 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -423,6 +423,7 @@ impl ErrorCode for HeedError { HeedError::Mdb(_) | HeedError::Encoding(_) | HeedError::Decoding(_) + | HeedError::InvalidDatabaseTyping | HeedError::DatabaseClosing | HeedError::BadOpenOptions { .. 
} => Code::Internal, } diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 612c6731b..ed62c5f48 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -108,7 +108,6 @@ tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.9" build-info = { version = "1.7.0", path = "../build-info" } -roaring = "0.10.3" [dev-dependencies] actix-rt = "2.9.0" diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 7c9b4b761..43fab1dae 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,14 +1,12 @@ -use std::io::{ErrorKind, Write}; +use std::io::ErrorKind; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; use actix_web::{web, HttpMessage, HttpRequest, HttpResponse}; use bstr::ByteSlice as _; -use bytes::Bytes; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::Deserr; use futures::StreamExt; -use futures_util::Stream; use index_scheduler::{IndexScheduler, TaskId}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; @@ -24,9 +22,7 @@ use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; -use roaring::RoaringBitmap; -use serde::ser::SerializeSeq; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use serde_json::Value; use tempfile::tempfile; use tokio::fs::File; @@ -234,34 +230,6 @@ pub async fn get_documents( documents_by_query(&index_scheduler, index_uid, query) } -pub struct Writer2Streamer { - sender: tokio::sync::mpsc::Sender>, -} - -impl Write for Writer2Streamer { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.sender.blocking_send(Ok(buf.to_vec().into())).map_err(std::io::Error::other)?; - Ok(buf.len()) - } - - fn flush(&mut 
self) -> std::io::Result<()> { - Ok(()) - } -} - -pub fn stream( - data: impl Serialize + Send + 'static, -) -> impl Stream> { - let (sender, receiver) = tokio::sync::mpsc::channel::>(1); - - tokio::task::spawn_blocking(move || { - serde_json::to_writer(std::io::BufWriter::new(Writer2Streamer { sender }), &data) - }); - futures_util::stream::unfold(receiver, |mut receiver| async { - receiver.recv().await.map(|value| (value, receiver)) - }) -} - fn documents_by_query( index_scheduler: &IndexScheduler, index_uid: web::Path, @@ -271,13 +239,12 @@ fn documents_by_query( let BrowseQuery { offset, limit, fields, filter } = query; let index = index_scheduler.index(&index_uid)?; - let documents = retrieve_documents(index, offset, limit, filter, fields)?; + let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?; - let ret = PaginationView::new(offset, limit, documents.total_documents as usize, documents); + let ret = PaginationView::new(offset, limit, total as usize, documents); debug!(returns = ?ret, "Get documents"); - - Ok(HttpResponse::Ok().streaming(stream(ret))) + Ok(HttpResponse::Ok().json(ret)) } #[derive(Deserialize, Debug, Deserr)] @@ -623,47 +590,14 @@ fn some_documents<'a, 't: 'a>( })) } -pub struct DocumentsStreamer { - attributes_to_retrieve: Option>, - documents: RoaringBitmap, - rtxn: RoTxn<'static>, - index: Index, - pub total_documents: u64, -} - -impl Serialize for DocumentsStreamer { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let mut seq = serializer.serialize_seq(Some(self.documents.len() as usize)).unwrap(); - - let documents = some_documents(&self.index, &self.rtxn, self.documents.iter()).unwrap(); - for document in documents { - let document = document.unwrap(); - let document = match self.attributes_to_retrieve { - Some(ref attributes_to_retrieve) => permissive_json_pointer::select_values( - &document, - attributes_to_retrieve.iter().map(|s| s.as_ref()), - ), - None => document, 
- }; - - seq.serialize_element(&document)?; - } - seq.end() - } -} - -fn retrieve_documents( - index: Index, +fn retrieve_documents>( + index: &Index, offset: usize, limit: usize, filter: Option, - attributes_to_retrieve: Option>, -) -> Result { - let rtxn = index.static_read_txn()?; - + attributes_to_retrieve: Option>, +) -> Result<(u64, Vec), ResponseError> { + let rtxn = index.read_txn()?; let filter = &filter; let filter = if let Some(filter) = filter { parse_filter(filter) @@ -673,7 +607,7 @@ fn retrieve_documents( }; let candidates = if let Some(filter) = filter { - filter.evaluate(&rtxn, &index).map_err(|err| match err { + filter.evaluate(&rtxn, index).map_err(|err| match err { milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter) } @@ -683,13 +617,27 @@ fn retrieve_documents( index.documents_ids(&rtxn)? }; - Ok(DocumentsStreamer { - total_documents: candidates.len(), - attributes_to_retrieve, - documents: candidates.into_iter().skip(offset).take(limit).collect(), - rtxn, - index, - }) + let (it, number_of_documents) = { + let number_of_documents = candidates.len(); + ( + some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, + number_of_documents, + ) + }; + + let documents: Result, ResponseError> = it + .map(|document| { + Ok(match &attributes_to_retrieve { + Some(attributes_to_retrieve) => permissive_json_pointer::select_values( + &document?, + attributes_to_retrieve.iter().map(|s| s.as_ref()), + ), + None => document?, + }) + }) + .collect(); + + Ok((number_of_documents, documents?)) } fn retrieve_document>( diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index a7e84d19c..c25aeee70 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::fmt; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; @@ -125,31 +124,20 @@ pub 
struct Pagination { pub limit: usize, } -#[derive(Clone, Serialize)] -pub struct PaginationView { - pub results: T, +#[derive(Debug, Clone, Serialize)] +pub struct PaginationView { + pub results: Vec, pub offset: usize, pub limit: usize, pub total: usize, } -impl fmt::Debug for PaginationView { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("PaginationView") - .field("offset", &self.offset) - .field("limit", &self.limit) - .field("total", &self.total) - .field("results", &"[...]") - .finish() - } -} - impl Pagination { /// Given the full data to paginate, returns the selected section. pub fn auto_paginate_sized( self, content: impl IntoIterator + ExactSizeIterator, - ) -> PaginationView> + ) -> PaginationView where T: Serialize, { @@ -163,7 +151,7 @@ impl Pagination { self, total: usize, content: impl IntoIterator, - ) -> PaginationView> + ) -> PaginationView where T: Serialize, { @@ -173,7 +161,7 @@ impl Pagination { /// Given the data already paginated + the total number of elements, it stores /// everything in a [PaginationResult]. - pub fn format_with(self, total: usize, results: Vec) -> PaginationView> + pub fn format_with(self, total: usize, results: Vec) -> PaginationView where T: Serialize, { @@ -181,8 +169,8 @@ impl Pagination { } } -impl PaginationView { - pub fn new(offset: usize, limit: usize, total: usize, results: T) -> Self { +impl PaginationView { + pub fn new(offset: usize, limit: usize, total: usize, results: Vec) -> Self { Self { offset, limit, results, total } } } diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index 06c4890a5..bfcbfdd6d 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -80,7 +80,9 @@ fn main() -> anyhow::Result<()> { /// Clears the task queue located at `db_path`. 
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { let path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) } + let env = EnvOpenOptions::new() + .max_dbs(100) + .open(&path) .with_context(|| format!("While trying to open {:?}", path.display()))?; eprintln!("Deleting tasks from the database..."); @@ -191,7 +193,9 @@ fn export_a_dump( FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?; let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + let env = EnvOpenOptions::new() + .max_dbs(100) + .open(&index_scheduler_path) .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; eprintln!("Dumping the keys..."); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c5dddd0fd..7d903178b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,7 +30,7 @@ grenad = { version = "0.4.6", default-features = false, features = [ "rayon", "tempfile", ] } -heed = { version = "0.20.1", default-features = false, features = [ +heed = { version = "0.20.0-alpha.9", default-features = false, features = [ "serde-json", "serde-bincode", "read-txn-no-tls", @@ -82,7 +82,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -arroy = "0.3.1" +arroy = "0.2.0" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore deleted file mode 100644 index a0925114d..000000000 --- a/milli/fuzz/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -target -corpus -artifacts diff --git a/milli/src/error.rs b/milli/src/error.rs index 6db0dcac1..009781fcf 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -48,6 +48,8 @@ pub enum InternalError { GrenadInvalidFormatVersion, #[error("Invalid merge while processing {process}")] IndexingMergingKeys { 
process: &'static str }, + #[error("{}", HeedError::InvalidDatabaseTyping)] + InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), #[error(transparent)] @@ -427,6 +429,7 @@ impl From for Error { // TODO use the encoding HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), + HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), HeedError::DatabaseClosing => InternalError(DatabaseClosing), HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions), } diff --git a/milli/src/index.rs b/milli/src/index.rs index 739a7f202..42b9cb111 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -184,7 +184,7 @@ impl Index { options.max_dbs(25); - let env = unsafe { options.open(path) }?; + let env = options.open(path)?; let mut wtxn = env.write_txn()?; let main = env.database_options().name(MAIN).create(&mut wtxn)?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; @@ -294,11 +294,6 @@ impl Index { self.env.read_txn() } - /// Create a static read transaction to be able to read the index without keeping a reference to it. - pub fn static_read_txn(&self) -> heed::Result> { - self.env.clone().static_read_txn() - } - /// Returns the canonicalized path where the heed `Env` of this `Index` lives. 
pub fn path(&self) -> &Path { self.env.path() diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 42994551f..0af64c4c5 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -379,7 +379,7 @@ pub(crate) mod test_helpers { let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); - let env = unsafe { options.open(tempdir.path()) }.unwrap(); + let env = options.open(tempdir.path()).unwrap(); let mut wtxn = env.write_txn().unwrap(); let content = env.create_database(&mut wtxn, None).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4d2fac7cb..936ce1efc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -556,7 +556,7 @@ where let writer_index = (embedder_index as u16) << 8; for k in 0..=u8::MAX { let writer = - arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension); + arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?; if writer.is_empty(wtxn)? 
{ break; } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e0de2d5a1..6aad290e5 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -661,7 +661,7 @@ pub(crate) fn write_typed_chunk_into_index( )?; let writer_index = (embedder_index as u16) << 8; // FIXME: allow customizing distance - let writers: Vec<_> = (0..=u8::MAX) + let writers: std::result::Result, _> = (0..=u8::MAX) .map(|k| { arroy::Writer::new( index.vector_arroy, @@ -670,6 +670,7 @@ pub(crate) fn write_typed_chunk_into_index( ) }) .collect(); + let writers = writers?; // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); From c9ac7f2e7e02ac927bba70b7cfeaa020f3e60534 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 16:10:55 +0200 Subject: [PATCH 40/56] update heed to latest version --- Cargo.lock | 26 +++++++++---------- index-scheduler/src/batch.rs | 10 ++++--- index-scheduler/src/lib.rs | 14 +++++----- meilisearch-auth/src/store.rs | 2 +- meilisearch-types/src/error.rs | 1 - meilitool/src/main.rs | 8 ++---- milli/Cargo.toml | 4 +-- milli/fuzz/.gitignore | 3 +++ milli/src/error.rs | 3 --- milli/src/index.rs | 7 ++++- milli/src/update/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 2 +- .../src/update/index_documents/typed_chunk.rs | 3 +-- 13 files changed, 43 insertions(+), 42 deletions(-) create mode 100644 milli/fuzz/.gitignore diff --git a/Cargo.lock b/Cargo.lock index 937fce64a..156917462 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,9 +378,9 @@ dependencies = [ [[package]] name = "arroy" -version = "0.2.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efddeb1e7c32a551cc07ef4c3e181e3cd5478fdaf4f0bd799983171c1f6efe57" +checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9" dependencies = [ "bytemuck", "byteorder", @@ -1536,9 
+1536,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "doxygen-rs" -version = "0.2.2" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff670ea0c9bbb8414e7efa6e23ebde2b8f520a7eef78273a3918cf1903e7505" +checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9" dependencies = [ "phf", ] @@ -2262,12 +2262,11 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.0-alpha.9" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934" +checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd" dependencies = [ "bitflags 2.5.0", - "bytemuck", "byteorder", "heed-traits", "heed-types", @@ -2281,15 +2280,15 @@ dependencies = [ [[package]] name = "heed-traits" -version = "0.20.0-alpha.9" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab0b7d9cde969ad36dde692e487dc89d97f7168bf6a7bd3b894ad4bf7278298" +checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" [[package]] name = "heed-types" -version = "0.20.0-alpha.9" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0cb3567a7363f28b597bf6e9897b9466397951dd0e52df2c8196dd8a71af44a" +checksum = "3cb0d6ba3700c9a57e83c013693e3eddb68a6d9b6781cacafc62a0d992e8ddb3" dependencies = [ "bincode", "byteorder", @@ -3189,14 +3188,13 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629c123f5321b48fa4f8f4d3b868165b748d9ba79c7103fb58e3a94f736bcedd" +checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a" dependencies 
= [ "cc", "doxygen-rs", "libc", - "pkg-config", ] [[package]] diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index bc9823a01..582497c15 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -785,10 +785,12 @@ impl IndexScheduler { let dst = temp_snapshot_dir.path().join("auth"); fs::create_dir_all(&dst)?; // TODO We can't use the open_auth_store_env function here but we should - let auth = milli::heed::EnvOpenOptions::new() - .map_size(1024 * 1024 * 1024) // 1 GiB - .max_dbs(2) - .open(&self.auth_path)?; + let auth = unsafe { + milli::heed::EnvOpenOptions::new() + .map_size(1024 * 1024 * 1024) // 1 GiB + .max_dbs(2) + .open(&self.auth_path) + }?; auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; // 5. Copy and tarball the flat snapshot diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5704f5354..dd2b296f6 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -453,10 +453,12 @@ impl IndexScheduler { ) }; - let env = heed::EnvOpenOptions::new() - .max_dbs(11) - .map_size(budget.task_db_size) - .open(options.tasks_path)?; + let env = unsafe { + heed::EnvOpenOptions::new() + .max_dbs(11) + .map_size(budget.task_db_size) + .open(options.tasks_path) + }?; let features = features::FeatureData::new(&env, options.instance_features)?; @@ -585,9 +587,9 @@ impl IndexScheduler { } fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { - if let Ok(env) = + if let Ok(env) = unsafe { heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) - { + } { env.prepare_for_closing().wait(); true } else { diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs index 1eebd3fe9..ef992e836 100644 --- a/meilisearch-auth/src/store.rs +++ b/meilisearch-auth/src/store.rs @@ -49,7 +49,7 @@ pub fn open_auth_store_env(path: &Path) -> milli::heed::Result let mut options = EnvOpenOptions::new(); options.map_size(AUTH_STORE_SIZE); // 
1GB options.max_dbs(2); - options.open(path) + unsafe { options.open(path) } } impl HeedAuthStore { diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index eea012331..158dfae92 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -423,7 +423,6 @@ impl ErrorCode for HeedError { HeedError::Mdb(_) | HeedError::Encoding(_) | HeedError::Decoding(_) - | HeedError::InvalidDatabaseTyping | HeedError::DatabaseClosing | HeedError::BadOpenOptions { .. } => Code::Internal, } diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index bfcbfdd6d..06c4890a5 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -80,9 +80,7 @@ fn main() -> anyhow::Result<()> { /// Clears the task queue located at `db_path`. fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { let path = db_path.join("tasks"); - let env = EnvOpenOptions::new() - .max_dbs(100) - .open(&path) + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) } .with_context(|| format!("While trying to open {:?}", path.display()))?; eprintln!("Deleting tasks from the database..."); @@ -193,9 +191,7 @@ fn export_a_dump( FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?; let index_scheduler_path = db_path.join("tasks"); - let env = EnvOpenOptions::new() - .max_dbs(100) - .open(&index_scheduler_path) + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; eprintln!("Dumping the keys..."); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7d903178b..c5dddd0fd 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,7 +30,7 @@ grenad = { version = "0.4.6", default-features = false, features = [ "rayon", "tempfile", ] } -heed = { version = "0.20.0-alpha.9", default-features = false, features = [ +heed = { version = "0.20.1", default-features = false, features = [ "serde-json", 
"serde-bincode", "read-txn-no-tls", @@ -82,7 +82,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -arroy = "0.2.0" +arroy = "0.3.1" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore new file mode 100644 index 000000000..a0925114d --- /dev/null +++ b/milli/fuzz/.gitignore @@ -0,0 +1,3 @@ +target +corpus +artifacts diff --git a/milli/src/error.rs b/milli/src/error.rs index 009781fcf..6db0dcac1 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -48,8 +48,6 @@ pub enum InternalError { GrenadInvalidFormatVersion, #[error("Invalid merge while processing {process}")] IndexingMergingKeys { process: &'static str }, - #[error("{}", HeedError::InvalidDatabaseTyping)] - InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), #[error(transparent)] @@ -429,7 +427,6 @@ impl From for Error { // TODO use the encoding HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), - HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), HeedError::DatabaseClosing => InternalError(DatabaseClosing), HeedError::BadOpenOptions { .. 
} => UserError(InvalidLmdbOpenOptions), } diff --git a/milli/src/index.rs b/milli/src/index.rs index 42b9cb111..739a7f202 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -184,7 +184,7 @@ impl Index { options.max_dbs(25); - let env = options.open(path)?; + let env = unsafe { options.open(path) }?; let mut wtxn = env.write_txn()?; let main = env.database_options().name(MAIN).create(&mut wtxn)?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; @@ -294,6 +294,11 @@ impl Index { self.env.read_txn() } + /// Create a static read transaction to be able to read the index without keeping a reference to it. + pub fn static_read_txn(&self) -> heed::Result> { + self.env.clone().static_read_txn() + } + /// Returns the canonicalized path where the heed `Env` of this `Index` lives. pub fn path(&self) -> &Path { self.env.path() diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 0af64c4c5..42994551f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -379,7 +379,7 @@ pub(crate) mod test_helpers { let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); - let env = options.open(tempdir.path()).unwrap(); + let env = unsafe { options.open(tempdir.path()) }.unwrap(); let mut wtxn = env.write_txn().unwrap(); let content = env.create_database(&mut wtxn, None).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 936ce1efc..4d2fac7cb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -556,7 +556,7 @@ where let writer_index = (embedder_index as u16) << 8; for k in 0..=u8::MAX { let writer = - arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?; + arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension); if writer.is_empty(wtxn)? 
{ break; } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 6aad290e5..e0de2d5a1 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -661,7 +661,7 @@ pub(crate) fn write_typed_chunk_into_index( )?; let writer_index = (embedder_index as u16) << 8; // FIXME: allow customizing distance - let writers: std::result::Result, _> = (0..=u8::MAX) + let writers: Vec<_> = (0..=u8::MAX) .map(|k| { arroy::Writer::new( index.vector_arroy, @@ -670,7 +670,6 @@ pub(crate) fn write_typed_chunk_into_index( ) }) .collect(); - let writers = writers?; // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); From 1aa8ed9ef7bc02fe805e77e2feee4b81031acb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 21 May 2024 14:53:26 +0200 Subject: [PATCH 41/56] Make the original sorter optional --- milli/src/update/index_documents/mod.rs | 22 +++++++++-- milli/src/update/index_documents/transform.rs | 38 ++++++++++++------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4d2fac7cb..cceb25338 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -6,6 +6,7 @@ mod typed_chunk; use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; +use std::iter; use std::num::NonZeroU32; use std::result::Result as StdResult; use std::sync::Arc; @@ -373,8 +374,11 @@ where } }; - let original_documents = grenad::Reader::new(original_documents)?; let flattened_documents = grenad::Reader::new(flattened_documents)?; + let original_documents = match original_documents { + Some(original_documents) => Some(grenad::Reader::new(original_documents)?), + None => None, + }; let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; @@ -393,11 
+397,21 @@ where pool.install(|| { rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); - let _enter = child_span.enter(); - puffin::profile_scope!("extract_and_send_grenad_chunks"); + let _enter = child_span.enter(); + puffin::profile_scope!("extract_and_send_grenad_chunks"); // split obkv file into several chunks let original_chunk_iter = - grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); + match original_documents { + Some(original_documents) => { + grenad_obkv_into_chunks( + original_documents, + pool_params, + documents_chunk_size + ) + .map(either::Either::Left) + }, + None => Ok(either::Right(iter::empty())), + }; // split obkv file into several chunks let flattened_chunk_iter = diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8a3463e6f..f7e3d79fd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -33,7 +33,7 @@ pub struct TransformOutput { pub settings_diff: InnerIndexSettingsDiff, pub field_distribution: FieldDistribution, pub documents_count: usize, - pub original_documents: File, + pub original_documents: Option, pub flattened_documents: File, } @@ -822,7 +822,9 @@ impl<'a, 'i> Transform<'a, 'i> { settings_diff, field_distribution, documents_count: self.documents_count, - original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, + original_documents: Some( + original_documents.into_inner().map_err(|err| err.into_error())?, + ), flattened_documents: flattened_documents .into_inner() .map_err(|err| err.into_error())?, @@ -891,14 +893,18 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_count = documents_ids.len() as usize; // We initialize the sorter with the user indexing settings. 
- let mut original_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - keep_first, - self.indexer_settings.chunk_compression_type, - self.indexer_settings.chunk_compression_level, - self.indexer_settings.max_nb_chunks, - self.indexer_settings.max_memory.map(|mem| mem / 2), - ); + let mut original_sorter = if settings_diff.reindex_vectors() { + Some(create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), + )) + } else { + None + }; // We initialize the sorter with the user indexing settings. let mut flattened_sorter = create_sorter( @@ -929,7 +935,9 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_key_buffer.clear(); document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); - original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + if let Some(original_sorter) = original_sorter.as_mut() { + original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + } flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; } @@ -941,16 +949,18 @@ impl<'a, 'i> Transform<'a, 'i> { }; // Once we have written all the documents, we merge everything into a Reader. 
- let original_documents = sorter_into_reader(original_sorter, grenad_params)?; - let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; + let original_documents = match original_sorter { + Some(original_sorter) => Some(sorter_into_reader(original_sorter, grenad_params)?), + None => None, + }; Ok(TransformOutput { primary_key, field_distribution, settings_diff, documents_count, - original_documents: original_documents.into_inner().into_inner(), + original_documents: original_documents.map(|od| od.into_inner().into_inner()), flattened_documents: flattened_documents.into_inner().into_inner(), }) } From 943f8dba0c97bd96f1a325abfbf5d76833e35c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 21 May 2024 14:58:36 +0200 Subject: [PATCH 42/56] Make clippy happy --- .../src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index dcab42c0a..1db518c7d 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -195,7 +195,7 @@ mod tests { fn merge_cbo_roaring_bitmaps() { let mut buffer = Vec::new(); - let small_data = vec![ + let small_data = [ RoaringBitmap::from_sorted_iter(1..4).unwrap(), RoaringBitmap::from_sorted_iter(2..5).unwrap(), RoaringBitmap::from_sorted_iter(4..6).unwrap(), @@ -209,7 +209,7 @@ mod tests { let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap(); assert_eq!(bitmap, expected); - let medium_data = vec![ + let medium_data = [ RoaringBitmap::from_sorted_iter(1..4).unwrap(), RoaringBitmap::from_sorted_iter(2..5).unwrap(), RoaringBitmap::from_sorted_iter(4..8).unwrap(), From eccbcf51300277a81eeb43678855bbe1299a65e2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 21 May 2024 14:59:08 +0200 Subject: 
[PATCH 43/56] Increase index-scheduler test timeouts --- index-scheduler/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index f743422a7..e4c9cd08f 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1852,7 +1852,7 @@ mod tests { // To be 100% consistent between all test we're going to start the scheduler right now // and ensure it's in the expected starting state. - let breakpoint = match receiver.recv_timeout(std::time::Duration::from_secs(1)) { + let breakpoint = match receiver.recv_timeout(std::time::Duration::from_secs(10)) { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") @@ -1963,7 +1963,7 @@ mod tests { fn advance(&mut self) -> Breakpoint { let (breakpoint_1, b) = match self .test_breakpoint_rcv - .recv_timeout(std::time::Duration::from_secs(5)) + .recv_timeout(std::time::Duration::from_secs(50)) { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { @@ -1984,7 +1984,7 @@ mod tests { let (breakpoint_2, b) = match self .test_breakpoint_rcv - .recv_timeout(std::time::Duration::from_secs(5)) + .recv_timeout(std::time::Duration::from_secs(50)) { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { From 500ddc76b549fb9f1af54b2dd6abfa15960381bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 21 May 2024 16:16:36 +0200 Subject: [PATCH 44/56] Make the flattened sorter optional --- milli/src/update/index_documents/mod.rs | 36 +++++++++------- milli/src/update/index_documents/transform.rs | 43 ++++++++++++------- milli/src/update/settings.rs | 1 + 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index cceb25338..dccfbe795 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -360,7 +360,10 
@@ where let min_chunk_size = 1024 * 512; // 512KiB // compute the chunk size from the number of available threads and the inputed data size. - let total_size = flattened_documents.metadata().map(|m| m.len()); + let total_size = match flattened_documents.as_ref() { + Some(flattened_documents) => flattened_documents.metadata().map(|m| m.len()), + None => Ok(default_chunk_size as u64), + }; let current_num_threads = pool.current_num_threads(); // if we have more than 2 thread, create a number of chunk equal to 3/4 threads count let chunk_count = if current_num_threads > 2 { @@ -374,11 +377,14 @@ where } }; - let flattened_documents = grenad::Reader::new(flattened_documents)?; let original_documents = match original_documents { Some(original_documents) => Some(grenad::Reader::new(original_documents)?), None => None, }; + let flattened_documents = match flattened_documents { + Some(flattened_documents) => Some(grenad::Reader::new(flattened_documents)?), + None => None, + }; let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; @@ -400,22 +406,20 @@ where let _enter = child_span.enter(); puffin::profile_scope!("extract_and_send_grenad_chunks"); // split obkv file into several chunks - let original_chunk_iter = - match original_documents { - Some(original_documents) => { - grenad_obkv_into_chunks( - original_documents, - pool_params, - documents_chunk_size - ) - .map(either::Either::Left) - }, - None => Ok(either::Right(iter::empty())), - }; + let original_chunk_iter = match original_documents { + Some(original_documents) => { + grenad_obkv_into_chunks(original_documents,pool_params,documents_chunk_size).map(either::Left) + }, + None => Ok(either::Right(iter::empty())), + }; // split obkv file into several chunks - let flattened_chunk_iter = - grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); + let flattened_chunk_iter = match flattened_documents { + Some(flattened_documents) => { + 
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size).map(either::Left) + }, + None => Ok(either::Right(iter::empty())), + }; let result = original_chunk_iter.and_then(|original_chunk| { let flattened_chunk = flattened_chunk_iter?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f7e3d79fd..8bedd778e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -34,7 +34,7 @@ pub struct TransformOutput { pub field_distribution: FieldDistribution, pub documents_count: usize, pub original_documents: Option, - pub flattened_documents: File, + pub flattened_documents: Option, } /// Extract the external ids, deduplicate and compute the new internal documents ids @@ -825,9 +825,9 @@ impl<'a, 'i> Transform<'a, 'i> { original_documents: Some( original_documents.into_inner().map_err(|err| err.into_error())?, ), - flattened_documents: flattened_documents - .into_inner() - .map_err(|err| err.into_error())?, + flattened_documents: Some( + flattened_documents.into_inner().map_err(|err| err.into_error())?, + ), }) } @@ -840,6 +840,9 @@ impl<'a, 'i> Transform<'a, 'i> { original_obkv_buffer: &mut Vec, flattened_obkv_buffer: &mut Vec, ) -> Result<()> { + /// TODO do a XOR of the faceted fields + /// TODO if reindex_searchable returns true store all searchables else none + /// TODO no longer useful after Tamo's PR let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); let mut obkv_writer = KvWriter::<_, FieldId>::memory(); @@ -907,14 +910,19 @@ impl<'a, 'i> Transform<'a, 'i> { }; // We initialize the sorter with the user indexing settings. 
- let mut flattened_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - keep_first, - self.indexer_settings.chunk_compression_type, - self.indexer_settings.chunk_compression_level, - self.indexer_settings.max_nb_chunks, - self.indexer_settings.max_memory.map(|mem| mem / 2), - ); + let mut flattened_sorter = + if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { + Some(create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), + )) + } else { + None + }; let mut original_obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); @@ -938,7 +946,9 @@ impl<'a, 'i> Transform<'a, 'i> { if let Some(original_sorter) = original_sorter.as_mut() { original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; } - flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + if let Some(flattened_sorter) = flattened_sorter.as_mut() { + flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + } } let grenad_params = GrenadParameters { @@ -949,7 +959,10 @@ impl<'a, 'i> Transform<'a, 'i> { }; // Once we have written all the documents, we merge everything into a Reader. 
- let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; + let flattened_documents = match flattened_sorter { + Some(flattened_sorter) => Some(sorter_into_reader(flattened_sorter, grenad_params)?), + None => None, + }; let original_documents = match original_sorter { Some(original_sorter) => Some(sorter_into_reader(original_sorter, grenad_params)?), None => None, @@ -961,7 +974,7 @@ impl<'a, 'i> Transform<'a, 'i> { settings_diff, documents_count, original_documents: original_documents.map(|od| od.into_inner().into_inner()), - flattened_documents: flattened_documents.into_inner().into_inner(), + flattened_documents: flattened_documents.map(|fd| fd.into_inner().into_inner()), }) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 0599bb9d8..c7d6ff0fd 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1099,6 +1099,7 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { + // TODO no longer useful after Tamo's PR self.old .fields_ids_map .iter() From 8f7c8ca7f0bf01b234b8f9d7d2435166a79f56bd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 22 May 2024 12:23:43 +0200 Subject: [PATCH 45/56] Remove now unused error variant --- meilisearch-types/src/error.rs | 1 - milli/src/error.rs | 2 -- 2 files changed, 3 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 158dfae92..85a2cd767 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -384,7 +384,6 @@ impl ErrorCode for milli::Error { UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType, - UserError::InvalidVectorsType { .. 
} => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, UserError::SortError(_) => Code::InvalidSearchSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { diff --git a/milli/src/error.rs b/milli/src/error.rs index e60252ec1..83754afe4 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -117,8 +117,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidGeoField(#[from] GeoError), #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)] InvalidVectorDimensions { expected: usize, found: usize }, - #[error("The `_vectors.{subfield}` field in the document with id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")] - InvalidVectorsType { document_id: Value, value: Value, subfield: String }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, #[error("{0}")] From 16037e21692a06c9ec7bacc2d4983cd00d91360c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 22 May 2024 12:24:51 +0200 Subject: [PATCH 46/56] Don't remove embedders that are not in the config from the document DB --- milli/src/update/index_documents/typed_chunk.rs | 15 +++++++++------ milli/src/vector/parsed_vectors.rs | 12 ++++++++---- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 6615a4bc3..2345551ab 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -211,6 
+211,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; + let embedders: BTreeSet<_> = + index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -225,9 +227,8 @@ pub(crate) fn write_typed_chunk_into_index( let del_add_reader = KvReaderDelAdd::new(value); if let Some(addition) = del_add_reader.get(DelAdd::Addition) { - let addition = match vectors_fid { - // for the "_vectors" field, only keep vectors that are marked as userProvided - Some(vectors_fid) if vectors_fid == field_id => 'vectors: { + let addition = if vectors_fid == Some(field_id) { + 'vectors: { vectors_buffer.clear(); let Ok(mut vectors) = crate::vector::parsed_vectors::ParsedVectors::from_bytes( @@ -237,7 +238,7 @@ pub(crate) fn write_typed_chunk_into_index( // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; - vectors.retain_user_provided_vectors(); + vectors.retain_user_provided_vectors(&embedders); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { // skip writing empty `_vectors` map @@ -248,8 +249,10 @@ pub(crate) fn write_typed_chunk_into_index( .map_err(InternalError::SerdeJson)?; Some(vectors_buffer.as_slice()) } - _ => Some(addition), + } else { + Some(addition) }; + if let Some(addition) = addition { writer.insert(field_id, addition)?; } diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 4e06177de..da67ccc83 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use obkv::KvReader; use serde_json::{from_slice, Value}; @@ -89,10 +89,14 @@ impl ParsedVectors { 
Ok(ParsedVectors(value)) } - pub fn retain_user_provided_vectors(&mut self) { - self.0.retain(|_k, v| match v { + pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, v| match v { Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => *user_provided, + Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { + *user_provided + // if the embedder is not in the config, then never touch it + || !embedders.contains(k) + } }); } } From 3412e7fbcfd9fdd4238741152b927a06ce0b3df5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 22 May 2024 12:25:21 +0200 Subject: [PATCH 47/56] "[]" is deserialized as 0 embedding rather than 1 embedding of dim 0 --- milli/src/vector/parsed_vectors.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index da67ccc83..2c61baa9e 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -136,19 +136,19 @@ fn to_vector_map( #[serde(transparent)] pub struct VectorOrArrayOfVectors { #[serde(with = "either::serde_untagged_optional")] - inner: Option>>, + inner: Option, Embedding>>, } impl VectorOrArrayOfVectors { pub fn into_array_of_vectors(self) -> Option> { match self.inner? { - either::Either::Left(vector) => Some(vec![vector]), - either::Either::Right(vectors) => Some(vectors), + either::Either::Left(vectors) => Some(vectors), + either::Either::Right(vector) => Some(vec![vector]), } } pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { - Self { inner: Some(either::Either::Right(array_of_vec)) } + Self { inner: Some(either::Either::Left(array_of_vec)) } } } @@ -167,12 +167,7 @@ mod test { serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null"); - // 👇 is the the intended behavior? 
would rather expect [] here, but changing that is a breaking change... - insta::assert_json_snapshot!(empty.into_array_of_vectors(), @r###" - [ - [] - ] - "###); + insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]"); insta::assert_json_snapshot!(one.into_array_of_vectors(), @r###" [ [ From 8a941c0241cb62956662f9725782b5a1db132339 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 22 May 2024 12:26:00 +0200 Subject: [PATCH 48/56] Smaller review changes --- index-scheduler/src/batch.rs | 17 ++++++++++------- milli/src/index.rs | 4 ++-- .../extract/extract_vector_points.rs | 4 +--- milli/src/update/index_documents/extract/mod.rs | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 1f5ec76b9..d10f83a0a 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -31,7 +31,9 @@ use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; -use meilisearch_types::milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::vector::parsed_vectors::{ + ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, +}; use meilisearch_types::milli::{self, Filter}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; @@ -955,12 +957,13 @@ impl IndexScheduler { for (embedder_name, embeddings) in embeddings { // don't change the entry if it already exists, because it was user-provided vectors.entry(embedder_name).or_insert_with(|| { - - let embeddings = milli::vector::parsed_vectors::ExplicitVectors { - embeddings: milli::vector::parsed_vectors::VectorOrArrayOfVectors::from_array_of_vectors(embeddings), - user_provided: false, - }; - serde_json::to_value(embeddings).unwrap() + let 
embeddings = ExplicitVectors { + embeddings: VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + ), + user_provided: false, + }; + serde_json::to_value(embeddings).unwrap() }); } } diff --git a/milli/src/index.rs b/milli/src/index.rs index 66cd6f3cc..982be0139 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,7 +23,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; -use crate::vector::EmbeddingConfig; +use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, @@ -1611,7 +1611,7 @@ impl Index { &self, rtxn: &RoTxn<'_>, docid: DocumentId, - ) -> Result>> { + ) -> Result>> { let mut res = BTreeMap::new(); for row in self.embedder_category_id.iter(rtxn)? { let (embedder_name, embedder_id) = row?; diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 8b78a8c55..724d9ea81 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -89,7 +89,7 @@ struct EmbedderVectorExtractor { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - settings_diff: Arc, + settings_diff: &InnerIndexSettingsDiff, ) -> Result> { puffin::profile_function!(); @@ -258,8 +258,6 @@ pub fn extract_vector_points( } } - ///// - let mut results = Vec::new(); for EmbedderVectorExtractor { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 0ea0fcc5c..7598c8094 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -241,7 +241,7 @@ fn send_original_documents_data( let 
original_documents_chunk = original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - match extract_vector_points(original_documents_chunk.clone(), indexer, settings_diff) { + match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { Ok(extracted_vectors) => { for ExtractedVectorPoints { manual_vectors, From bc5663e673ccb4f364fb384b5562fda0c1521416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 22 May 2024 16:06:15 +0200 Subject: [PATCH 49/56] FieldIdsMap no longer useful thanks to #4631 --- milli/src/update/index_documents/transform.rs | 6 +++--- milli/src/update/settings.rs | 10 ++-------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8bedd778e..aef4d1583 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -840,9 +840,9 @@ impl<'a, 'i> Transform<'a, 'i> { original_obkv_buffer: &mut Vec, flattened_obkv_buffer: &mut Vec, ) -> Result<()> { - /// TODO do a XOR of the faceted fields - /// TODO if reindex_searchable returns true store all searchables else none - /// TODO no longer useful after Tamo's PR + // TODO do a XOR of the faceted fields + // TODO if reindex_searchable returns true store all searchables else none + // TODO no longer useful after Tamo's PR let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); let mut obkv_writer = KvWriter::<_, FieldId>::memory(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c7d6ff0fd..1529e1fe6 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1099,14 +1099,8 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { - // TODO no longer useful after Tamo's PR - self.old - 
.fields_ids_map - .iter() - .zip(self.new.fields_ids_map.iter()) - .any(|(old, new)| old != new) - || self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) || self.old.allowed_separators != self.new.allowed_separators || self.old.dictionary != self.new.dictionary || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields From fe17c0f52e22b30fb6aec06fb233fbd4afbccf8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 22 May 2024 16:05:55 +0200 Subject: [PATCH 50/56] Construct the minimal OBKVs according to the settings diff --- meilisearch/tests/search/mod.rs | 20 +++ milli/src/lib.rs | 3 +- milli/src/update/index_documents/transform.rs | 137 +++++++++++------- milli/src/update/settings.rs | 26 ++-- 4 files changed, 122 insertions(+), 64 deletions(-) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index f601e2b03..b02c10319 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -680,6 +680,26 @@ async fn search_facet_distribution() { }, ) .await; + + index.update_settings(json!({"filterableAttributes": ["doggos.name"]})).await; + index.wait_task(5).await; + + index + .search( + json!({ + "facets": ["doggos.name"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + let dist = response["facetDistribution"].as_object().unwrap(); + assert_eq!(dist.len(), 1); + assert_eq!( + dist["doggos.name"], + json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) + ); + }, + ) + .await; } #[actix_rt::test] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 881633b5c..c74aa10e8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -354,8 +354,7 @@ pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator bool { - 
field.starts_with(facet) - && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) + field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.') } pub fn normalize_facet(original: &str) -> String { diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index aef4d1583..733e74800 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -20,13 +20,13 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{ - del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd, -}; +use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; +use crate::{ + is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, +}; pub struct TransformOutput { pub primary_key: String, @@ -808,11 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> { })?; let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?; + let fields_ids_map = self.fields_ids_map; + let primary_key_id = self.index.primary_key(wtxn)?.and_then(|name| fields_ids_map.id(name)); let mut new_inner_settings = 
old_inner_settings.clone(); - new_inner_settings.fields_ids_map = self.fields_ids_map; + new_inner_settings.fields_ids_map = fields_ids_map; + let settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, + primary_key_id, embedding_configs_updated: false, settings_update_only: false, }; @@ -837,37 +841,66 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, - original_obkv_buffer: &mut Vec, - flattened_obkv_buffer: &mut Vec, + modified_faceted_fields: &HashSet, + original_obkv_buffer: Option<&mut Vec>, + flattened_obkv_buffer: Option<&mut Vec>, ) -> Result<()> { - // TODO do a XOR of the faceted fields - // TODO if reindex_searchable returns true store all searchables else none - // TODO no longer useful after Tamo's PR - let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); - let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); + // Always keep the primary key. + let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; + + // If only the `searchableAttributes` has been changed, keep only the searchable fields. + let must_reindex_searchables = settings_diff.reindex_searchable(); + let necessary_searchable_field = |id: FieldId| -> bool { + must_reindex_searchables + && (settings_diff.old.searchable_fields_ids.contains(&id) + || settings_diff.new.searchable_fields_ids.contains(&id)) + }; + + // If only a faceted field has been added, keep only this field. 
+ let must_reindex_facets = settings_diff.reindex_facets(); + let necessary_faceted_field = |id: FieldId| -> bool { + let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); + must_reindex_facets + && modified_faceted_fields + .iter() + .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) + }; + + // Always provide all fields when vectors are involved because + // we need the fields for the prompt/templating. + let reindex_vectors = settings_diff.reindex_vectors(); + let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. - for (id, name) in new_fields_ids_map.iter() { - if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) { + for (id, val) in old_obkv.iter() { + if is_primary_key(id) + || necessary_searchable_field(id) + || necessary_faceted_field(id) + || reindex_vectors + { obkv_writer.insert(id, val)?; } } let data = obkv_writer.into_inner()?; - let new_obkv = KvReader::::new(&data); + let obkv = KvReader::::new(&data); - // take the non-flattened version if flatten_from_fields_ids_map returns None. - let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?; - let old_flattened = - old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::::new); - let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?; - let new_flattened = - new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::::new); + if let Some(original_obkv_buffer) = original_obkv_buffer { + original_obkv_buffer.clear(); + into_del_add_obkv(obkv, DelAddOperation::DeletionAndAddition, original_obkv_buffer)?; + } - original_obkv_buffer.clear(); - flattened_obkv_buffer.clear(); + if let Some(flattened_obkv_buffer) = flattened_obkv_buffer { + // take the non-flattened version if flatten_from_fields_ids_map returns None.
+ let mut fields_ids_map = settings_diff.new.fields_ids_map.clone(); + let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?; + let flattened = flattened.as_deref().map_or(obkv, KvReader::new); - del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?; - del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?; + flattened_obkv_buffer.clear(); + into_del_add_obkv( + flattened, + DelAddOperation::DeletionAndAddition, + flattened_obkv_buffer, + )?; + } Ok(()) } @@ -924,30 +957,34 @@ impl<'a, 'i> Transform<'a, 'i> { None }; - let mut original_obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); - let mut document_sorter_key_buffer = Vec::new(); - for result in self.index.external_documents_ids().iter(wtxn)? { - let (external_id, docid) = result?; - let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; + if original_sorter.is_some() || flattened_sorter.is_some() { + let modified_faceted_fields = settings_diff.modified_faceted_fields(); + let mut original_obkv_buffer = Vec::new(); + let mut flattened_obkv_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); + for result in self.index.external_documents_ids().iter(wtxn)? 
{ + let (external_id, docid) = result?; + let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; - Self::rebind_existing_document( - old_obkv, - &settings_diff, - &mut original_obkv_buffer, - &mut flattened_obkv_buffer, - )?; + Self::rebind_existing_document( + old_obkv, + &settings_diff, + &modified_faceted_fields, + Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), + Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), + )?; - document_sorter_key_buffer.clear(); - document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); - document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); - if let Some(original_sorter) = original_sorter.as_mut() { - original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; - } - if let Some(flattened_sorter) = flattened_sorter.as_mut() { - flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + if let Some(original_sorter) = original_sorter.as_mut() { + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + } + if let Some(flattened_sorter) = flattened_sorter.as_mut() { + flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + } } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1529e1fe6..0fd39ce77 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1067,10 +1067,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // 3. 
Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage let embedding_configs_updated = self.update_embedding_configs()?; - let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; + let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; + new_inner_settings.recompute_facets(self.wtxn, self.index)?; + + let primary_key_id = self + .index + .primary_key(self.wtxn)? + .and_then(|name| new_inner_settings.fields_ids_map.id(name)); let inner_settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, + primary_key_id, embedding_configs_updated, settings_update_only: true, }; @@ -1086,10 +1093,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, - + pub(crate) primary_key_id: Option, // TODO: compare directly the embedders. pub(crate) embedding_configs_updated: bool, - pub(crate) settings_update_only: bool, } @@ -1127,15 +1133,7 @@ impl InnerIndexSettingsDiff { return true; } - let faceted_updated = - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields); - - self.old - .fields_ids_map - .iter() - .zip(self.new.fields_ids_map.iter()) - .any(|(old, new)| old != new) - || faceted_updated + (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) } pub fn reindex_vectors(&self) -> bool { @@ -1145,6 +1143,10 @@ impl InnerIndexSettingsDiff { pub fn settings_update_only(&self) -> bool { self.settings_update_only } + + pub fn modified_faceted_fields(&self) -> HashSet { + &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields + } } #[derive(Clone)] From e3407056347571d3938938ecab1968dda0aeb6a4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 23 May 2024 15:29:06 +0200 Subject: [PATCH 51/56] Change benchmark outputs - logs to stderr instead of stdout - 
prints links to the dashboard when there is a dashboard --- xtask/src/bench/client.rs | 4 ++++ xtask/src/bench/dashboard.rs | 43 ++++++++++++++++++++++++++--------- xtask/src/bench/mod.rs | 44 ++++++++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/xtask/src/bench/client.rs b/xtask/src/bench/client.rs index 3e46615cc..1c2b743af 100644 --- a/xtask/src/bench/client.rs +++ b/xtask/src/bench/client.rs @@ -55,6 +55,10 @@ impl Client { pub fn delete(&self, route: &str) -> reqwest::RequestBuilder { self.request(reqwest::Method::DELETE, route) } + + pub fn base_url(&self) -> Option<&str> { + self.base_url.as_deref() + } } #[derive(Debug, Clone, Copy, Deserialize)] diff --git a/xtask/src/bench/dashboard.rs b/xtask/src/bench/dashboard.rs index 3ba0ca58b..67353f7bb 100644 --- a/xtask/src/bench/dashboard.rs +++ b/xtask/src/bench/dashboard.rs @@ -18,12 +18,9 @@ pub enum DashboardClient { } impl DashboardClient { - pub fn new(dashboard_url: &str, api_key: Option<&str>) -> anyhow::Result { - let dashboard_client = Client::new( - Some(format!("{}/api/v1", dashboard_url)), - api_key, - Some(std::time::Duration::from_secs(60)), - )?; + pub fn new(dashboard_url: String, api_key: Option<&str>) -> anyhow::Result { + let dashboard_client = + Client::new(Some(dashboard_url), api_key, Some(std::time::Duration::from_secs(60)))?; Ok(Self::Client(dashboard_client)) } @@ -36,7 +33,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(()) }; let response = dashboard_client - .put("machine") + .put("/api/v1/machine") .json(&json!({"hostname": env.hostname})) .send() .await @@ -62,7 +59,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(Uuid::now_v7()) }; let response = dashboard_client - .put("invocation") + .put("/api/v1/invocation") .json(&json!({ "commit": { "sha1": build_info.commit_sha1, @@ -97,7 +94,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { 
return Ok(Uuid::now_v7()) }; let response = dashboard_client - .put("workload") + .put("/api/v1/workload") .json(&json!({ "invocation_uuid": invocation_uuid, "name": &workload.name, @@ -124,7 +121,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(()) }; let response = dashboard_client - .put("run") + .put("/api/v1/run") .json(&json!({ "workload_uuid": workload_uuid, "data": report @@ -159,7 +156,7 @@ impl DashboardClient { pub async fn mark_as_failed(&self, invocation_uuid: Uuid, failure_reason: Option) { if let DashboardClient::Client(client) = self { let response = client - .post("cancel-invocation") + .post("/api/v1/cancel-invocation") .json(&json!({ "invocation_uuid": invocation_uuid, "failure_reason": failure_reason, @@ -186,4 +183,28 @@ impl DashboardClient { tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); } + + /// Result URL in markdown + pub(crate) fn result_url( + &self, + workload_name: &str, + build_info: &build_info::BuildInfo, + baseline_branch: &str, + ) -> String { + let Self::Client(client) = self else { return Default::default() }; + let Some(base_url) = client.base_url() else { return Default::default() }; + + let Some(commit_sha1) = build_info.commit_sha1 else { return Default::default() }; + + // https://bench.meilisearch.dev/view_spans?commit_sha1=500ddc76b549fb9f1af54b2dd6abfa15960381bb&workload_name=settings-add-remove-filters.json&target_branch=reduce-transform-disk-usage&baseline_branch=main + let mut url = format!( + "{base_url}/view_spans?commit_sha1={commit_sha1}&workload_name={workload_name}" + ); + + if let Some(target_branch) = build_info.branch { + url += &format!("&target_branch={target_branch}&baseline_branch={baseline_branch}"); + } + + format!("[{workload_name} compared with {baseline_branch}]({url})") + } } diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs index 844b64f63..fdb2c4963 100644 --- a/xtask/src/bench/mod.rs +++ b/xtask/src/bench/mod.rs @@ 
-6,6 +6,7 @@ mod env_info; mod meili_process; mod workload; +use std::io::LineWriter; use std::path::PathBuf; use anyhow::Context; @@ -90,6 +91,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let subscriber = tracing_subscriber::registry().with( tracing_subscriber::fmt::layer() + .with_writer(|| LineWriter::new(std::io::stderr())) .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE) .with_filter(filter), ); @@ -110,7 +112,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let dashboard_client = if args.no_dashboard { dashboard::DashboardClient::new_dry() } else { - dashboard::DashboardClient::new(&args.dashboard_url, args.api_key.as_deref())? + dashboard::DashboardClient::new(args.dashboard_url.clone(), args.api_key.as_deref())? }; // reporting uses its own client because keeping the stream open to wait for entries @@ -136,7 +138,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); - let invocation_uuid = dashboard_client.create_invocation( build_info, commit_message, env, max_workloads, reason).await?; + let invocation_uuid = dashboard_client.create_invocation(build_info.clone(), commit_message, env, max_workloads, reason).await?; tracing::info!(workload_count = args.workload_file.len(), "handling workload files"); @@ -144,6 +146,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let workload_runs = tokio::spawn( { let dashboard_client = dashboard_client.clone(); + let mut dashboard_urls = Vec::new(); async move { for workload_file in args.workload_file.iter() { let workload: Workload = serde_json::from_reader( @@ -152,6 +155,8 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { ) .with_context(|| format!("error parsing {} as JSON", workload_file.display()))?; + let workload_name = workload.name.clone(); + 
workload::execute( &assets_client, &dashboard_client, @@ -163,8 +168,23 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { &args, ) .await?; + + let result_url = dashboard_client.result_url(&workload_name, &build_info, "main"); + + if !result_url.is_empty() { + dashboard_urls.push(result_url); + } + + if let Some(branch) = build_info.branch { + let result_url = dashboard_client.result_url(&workload_name, &build_info, branch); + + + if !result_url.is_empty() { + dashboard_urls.push(result_url); + } + } } - Ok::<(), anyhow::Error>(()) + Ok::<_, anyhow::Error>(dashboard_urls) }}); // handle ctrl-c @@ -176,13 +196,19 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { // wait for the end of the main task, handle result match workload_runs.await { - Ok(Ok(_)) => { + Ok(Ok(urls)) => { tracing::info!("Success"); + println!("☀️ Benchmark invocation completed, please find the results for your workloads below:"); + for url in urls { + println!("- {url}"); + } Ok::<(), anyhow::Error>(()) } Ok(Err(error)) => { tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard"); dashboard_client.mark_as_failed(invocation_uuid, Some(error.to_string())).await; + println!("☔️ Benchmark invocation failed..."); + println!("{error}"); tracing::warn!(%invocation_uuid, "invocation marked as failed following error"); Err(error) }, @@ -191,10 +217,20 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { Ok(panic) => { tracing::error!("invocation panicked, attempting to report the failure to dashboard"); dashboard_client.mark_as_failed( invocation_uuid, Some("Panicked".into())).await; + println!("‼️ Benchmark invocation panicked 😱"); + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + println!("panicked at {msg}"); std::panic::resume_unwind(panic) } Err(_) => { tracing::warn!("task was canceled"); + 
println!("🚫 Benchmark invocation was canceled"); Ok(()) } } From eaf57056cabeab6a776b2aeb7c093b13bea6bdcb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 23 May 2024 15:33:07 +0200 Subject: [PATCH 52/56] comment with the results of benchmarks --- .github/workflows/bench-pr.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index 418a23717..36af79460 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -43,4 +43,11 @@ jobs: - name: Run benchmarks on PR ${{ github.event.issue.id }} run: | - cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" -- ${{ steps.command.outputs.command-arguments }} \ No newline at end of file + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" \ + --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" \ + --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" \ + -- ${{ steps.command.outputs.command-arguments }} > benchlinks.txt + + - name: Send comment in PR + run: | + gh pr comment ${{github.event.issue.number}} --body-file benchlinks.txt From 7f3e51349e2631fed5e67254a4dd35324f89d0db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 27 May 2024 15:53:06 +0200 Subject: [PATCH 53/56] Remove puffin for the dependencies --- Cargo.lock | 26 -------------------------- index-scheduler/Cargo.toml | 1 - meilisearch/Cargo.toml | 1 - milli/Cargo.toml | 3 --- 4 files changed, 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 156917462..008f18a16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2464,7 +2464,6 @@ dependencies = [ "meilisearch-auth", "meilisearch-types", "page_size 0.5.0", - "puffin", "rayon", 
"roaring", "serde", @@ -3231,12 +3230,6 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" -[[package]] -name = "lz4_flex" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83" - [[package]] name = "macro_rules_attribute" version = "0.2.0" @@ -3341,7 +3334,6 @@ dependencies = [ "pin-project-lite", "platform-dirs", "prometheus", - "puffin", "rand", "rayon", "regex", @@ -3509,7 +3501,6 @@ dependencies = [ "obkv", "once_cell", "ordered-float", - "puffin", "rand", "rand_pcg", "rayon", @@ -4180,23 +4171,6 @@ version = "2.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" -[[package]] -name = "puffin" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76425abd4e1a0ad4bd6995dd974b52f414fca9974171df8e3708b3e660d05a21" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "cfg-if", - "instant", - "lz4_flex", - "once_cell", - "parking_lot", - "serde", -] - [[package]] name = "pulp" version = "0.18.9" diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 4b6c0a36d..21fa34733 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -22,7 +22,6 @@ flate2 = "1.0.28" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } page_size = "0.5.0" -puffin = { version = "0.16.0", features = ["serialization"] } rayon = "1.8.1" roaring = { version = "0.10.2", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index ed62c5f48..75962c450 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -67,7 +67,6 @@ permissive-json-pointer = { 
path = "../permissive-json-pointer" } pin-project-lite = "0.2.13" platform-dirs = "0.3.0" prometheus = { version = "0.13.3", features = ["process"] } -puffin = { version = "0.16.0", features = ["serialization"] } rand = "0.8.5" rayon = "1.8.0" regex = "1.10.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c5dddd0fd..4a08e6261 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -67,9 +67,6 @@ filter-parser = { path = "../filter-parser" } # documents words self-join itertools = "0.11.0" -# profiling -puffin = "0.16.0" - csv = "1.3.0" candle-core = { version = "0.4.1" } candle-transformers = { version = "0.4.1" } From dc949ab46a7bcd60b250f4131c3fd0e4dfa41800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 27 May 2024 15:59:14 +0200 Subject: [PATCH 54/56] Remove puffin usage --- index-scheduler/src/batch.rs | 6 -- index-scheduler/src/lib.rs | 36 ----------- milli/src/update/clear_documents.rs | 2 - milli/src/update/index_documents/enrich.rs | 2 - .../extract/extract_docid_word_positions.rs | 2 - .../extract/extract_facet_number_docids.rs | 2 - .../extract/extract_facet_string_docids.rs | 2 - .../extract/extract_fid_docid_facet_values.rs | 2 - .../extract/extract_fid_word_count_docids.rs | 2 - .../extract/extract_geo_points.rs | 2 - .../extract/extract_vector_points.rs | 4 -- .../extract/extract_word_docids.rs | 4 -- .../extract_word_pair_proximity_docids.rs | 5 -- .../extract/extract_word_position_docids.rs | 4 -- .../src/update/index_documents/extract/mod.rs | 11 +--- .../index_documents/helpers/grenad_helpers.rs | 3 - milli/src/update/index_documents/mod.rs | 17 +---- milli/src/update/index_documents/transform.rs | 8 --- .../src/update/index_documents/typed_chunk.rs | 62 ------------------- milli/src/update/settings.rs | 2 - milli/src/update/word_prefix_docids.rs | 2 - .../src/update/words_prefix_integer_docids.rs | 1 - milli/src/update/words_prefixes_fst.rs | 2 - 23 files changed, 2 insertions(+), 181 deletions(-) diff --git 
a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index d10f83a0a..181ac49a3 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -529,8 +529,6 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(crate::tests::FailureLocation::InsideCreateBatch)?; - puffin::profile_function!(); - let enqueued = &self.get_status(rtxn, Status::Enqueued)?; let to_cancel = self.get_kind(rtxn, Kind::TaskCancelation)? & enqueued; @@ -639,8 +637,6 @@ impl IndexScheduler { self.breakpoint(crate::Breakpoint::InsideProcessBatch); } - puffin::profile_function!(batch.to_string()); - match batch { Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => { // 1. Retrieve the tasks that matched the query at enqueue-time. @@ -1226,8 +1222,6 @@ impl IndexScheduler { index: &'i Index, operation: IndexOperation, ) -> Result> { - puffin::profile_function!(); - match operation { IndexOperation::DocumentClear { mut tasks, .. } => { let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index e4c9cd08f..8a1c2f540 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -33,7 +33,6 @@ pub type Result = std::result::Result; pub type TaskId = u32; use std::collections::{BTreeMap, HashMap}; -use std::fs::File; use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; @@ -59,7 +58,6 @@ use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfi use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; -use puffin::FrameView; use rayon::current_num_threads; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -344,9 +342,6 @@ pub struct IndexScheduler { /// The Authorization 
header to send to the webhook URL. pub(crate) webhook_authorization_header: Option, - /// A frame to output the indexation profiling files to disk. - pub(crate) puffin_frame: Arc, - /// The path used to create the dumps. pub(crate) dumps_path: PathBuf, @@ -401,7 +396,6 @@ impl IndexScheduler { cleanup_enabled: self.cleanup_enabled, max_number_of_tasks: self.max_number_of_tasks, max_number_of_batched_tasks: self.max_number_of_batched_tasks, - puffin_frame: self.puffin_frame.clone(), snapshots_path: self.snapshots_path.clone(), dumps_path: self.dumps_path.clone(), auth_path: self.auth_path.clone(), @@ -500,7 +494,6 @@ impl IndexScheduler { env, // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things wake_up: Arc::new(SignalEvent::auto(true)), - puffin_frame: Arc::new(puffin::GlobalFrameView::default()), autobatching_enabled: options.autobatching_enabled, cleanup_enabled: options.cleanup_enabled, max_number_of_tasks: options.max_number_of_tasks, @@ -621,10 +614,6 @@ impl IndexScheduler { run.wake_up.wait(); loop { - let puffin_enabled = run.features().check_puffin().is_ok(); - puffin::set_scopes_on(puffin_enabled); - puffin::GlobalProfiler::lock().new_frame(); - match run.tick() { Ok(TickOutcome::TickAgain(_)) => (), Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(), @@ -636,31 +625,6 @@ impl IndexScheduler { } } } - - // Let's write the previous frame to disk but only if - // the user wanted to profile with puffin. - if puffin_enabled { - let mut frame_view = run.puffin_frame.lock(); - if !frame_view.is_empty() { - let now = OffsetDateTime::now_utc(); - let mut file = match File::create(format!("{}.puffin", now)) { - Ok(file) => file, - Err(e) => { - tracing::error!("{e}"); - continue; - } - }; - if let Err(e) = frame_view.save_to_writer(&mut file) { - tracing::error!("{e}"); - } - if let Err(e) = file.sync_all() { - tracing::error!("{e}"); - } - // We erase this frame view as it is no more useful. 
We want to - // measure the new frames now that we exported the previous ones. - *frame_view = FrameView::default(); - } - } } }) .unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6715939dc..3490b55e4 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -21,8 +21,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { name = "clear_documents" )] pub fn execute(self) -> Result { - puffin::profile_function!(); - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; let Index { env: _env, diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 162136912..2da717bb0 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -29,8 +29,6 @@ pub fn enrich_documents_batch( autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { - puffin::profile_function!(); - let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index d97b6639e..9c557de81 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -29,8 +29,6 @@ pub fn extract_docid_word_positions( settings_diff: &InnerIndexSettingsDiff, max_positions_per_attributes: Option, ) -> Result<(grenad::Reader>, ScriptLanguageDocidsMap)> { - puffin::profile_function!(); - let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); diff --git 
a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 1848a085f..bfd769604 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -23,8 +23,6 @@ pub fn extract_facet_number_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut facet_number_docids_sorter = create_sorter( diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index abffe17ab..3deace127 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -28,8 +28,6 @@ pub fn extract_facet_string_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result<(grenad::Reader>, grenad::Reader>)> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let options = NormalizerOption { lossy: true, ..Default::default() }; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 123c3b123..a2b060255 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -47,8 +47,6 @@ pub fn extract_fid_docid_facet_values( settings_diff: &InnerIndexSettingsDiff, geo_fields_ids: Option<(FieldId, FieldId)>, ) -> Result { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( diff --git 
a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 51e0642da..f252df1cd 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -26,8 +26,6 @@ pub fn extract_fid_word_count_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut fid_word_count_docids_sorter = create_sorter( diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index cfcc021c6..3d7463fba 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -20,8 +20,6 @@ pub fn extract_geo_points( primary_key_id: FieldId, (lat_fid, lng_fid): (FieldId, FieldId), ) -> Result>> { - puffin::profile_function!(); - let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 724d9ea81..76ec90d65 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -91,8 +91,6 @@ pub fn extract_vector_points( indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, ) -> Result> { - puffin::profile_function!(); - let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; @@ -295,7 +293,6 @@ fn push_vectors_diff( delta: VectorStateDelta, reindex_vectors: bool, ) -> Result<()> { - puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = 
delta.into_values(); if must_remove // TODO: the below condition works because we erase the vec database when a embedding setting changes. @@ -367,7 +364,6 @@ pub fn extract_embeddings( embedder: Arc, request_threads: &ThreadPoolNoAbort, ) -> Result>> { - puffin::profile_function!(); let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5699f2fb6..457d2359e 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -36,8 +36,6 @@ pub fn extract_word_docids( grenad::Reader>, grenad::Reader>, )> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut word_fid_docids_sorter = create_sorter( @@ -167,8 +165,6 @@ fn words_into_sorter( add_words: &BTreeSet>, word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { - puffin::profile_function!(); - use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 23f70ccd2..617338f9f 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -26,7 +26,6 @@ pub fn extract_word_pair_proximity_docids( indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; @@ -71,8 +70,6 @@ pub fn 
extract_word_pair_proximity_docids( // if we change document, we fill the sorter if current_document_id.map_or(false, |id| id != document_id) { - puffin::profile_scope!("Document into sorter"); - // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter"); let _entered = span.enter(); @@ -163,7 +160,6 @@ pub fn extract_word_pair_proximity_docids( } if let Some(document_id) = current_document_id { - puffin::profile_scope!("Final document into sorter"); // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter"); let _entered = span.enter(); @@ -176,7 +172,6 @@ pub fn extract_word_pair_proximity_docids( )?; } { - puffin::profile_scope!("sorter_into_reader"); // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader"); let _entered = span.enter(); diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 45a05b0d0..50b1617f9 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -25,8 +25,6 @@ pub fn extract_word_position_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut word_position_docids_sorter = create_sorter( @@ -104,8 +102,6 @@ fn words_position_into_sorter( add_word_positions: &BTreeSet<(u16, Vec)>, word_position_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { - puffin::profile_function!(); - use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff 
--git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 7598c8094..90723bc4a 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -47,8 +47,6 @@ pub(crate) fn data_from_obkv_documents( settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<()> { - puffin::profile_function!(); - let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join( || { original_obkv_chunks @@ -90,7 +88,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_fid_word_count_docids, TypedChunk::FieldIdWordCountDocids, - "field-id-wordcount-docids", ); run_extraction_task::< _, @@ -117,7 +114,6 @@ pub(crate) fn data_from_obkv_documents( word_fid_docids_reader, } }, - "word-docids", ); run_extraction_task::<_, _, grenad::Reader>>( @@ -127,7 +123,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_word_position_docids, TypedChunk::WordPositionDocids, - "word-position-docids", ); run_extraction_task::< @@ -141,7 +136,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_facet_string_docids, TypedChunk::FieldIdFacetStringDocids, - "field-id-facet-string-docids", ); run_extraction_task::<_, _, grenad::Reader>>( @@ -151,7 +145,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_facet_number_docids, TypedChunk::FieldIdFacetNumberDocids, - "field-id-facet-number-docids", ); run_extraction_task::<_, _, grenad::Reader>>( @@ -161,7 +154,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_word_pair_proximity_docids, TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", ); } @@ -185,7 +177,6 @@ fn run_extraction_task( lmdb_writer_sx: Sender>, extract_fn: FE, serialize_fn: FS, - name: &'static str, ) where FE: Fn( grenad::Reader, @@ -203,7 +194,7 @@ fn run_extraction_task( rayon::spawn(move || { let 
child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); - puffin::profile_scope!("extract_multiple_chunks", name); + match extract_fn(chunk, indexer, &settings_diff) { Ok(chunk) => { let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index b0e3654a9..aa574024d 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -61,7 +61,6 @@ pub fn sorter_into_reader( sorter: grenad::Sorter, indexer: GrenadParameters, ) -> Result>> { - puffin::profile_function!(); let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -182,8 +181,6 @@ where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { - puffin::profile_function!(); - let mut buffer = Vec::new(); let database = database.remap_types::(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index dccfbe795..f281becd6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -141,8 +141,6 @@ where mut self, reader: DocumentsBatchReader, ) -> Result<(Self, StdResult)> { - puffin::profile_function!(); - // Early return when there is no document to add if reader.is_empty() { return Ok((self, Ok(0))); @@ -187,8 +185,6 @@ where mut self, to_delete: Vec, ) -> Result<(Self, StdResult)> { - puffin::profile_function!(); - // Early return when there is no document to add if to_delete.is_empty() { // Maintains Invariant: remove documents actually always returns Ok for the inner result @@ -223,8 +219,6 @@ where mut self, to_delete: &RoaringBitmap, ) -> Result<(Self, u64)> { - puffin::profile_function!(); - // Early return when there is 
no document to add if to_delete.is_empty() { return Ok((self, 0)); @@ -249,8 +243,6 @@ where name = "index_documents" )] pub fn execute(mut self) -> Result { - puffin::profile_function!(); - if self.added_documents == 0 && self.deleted_documents == 0 { let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); @@ -279,8 +271,6 @@ where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - let TransformOutput { primary_key, mut settings_diff, @@ -404,7 +394,7 @@ where rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); let _enter = child_span.enter(); - puffin::profile_scope!("extract_and_send_grenad_chunks"); + // split obkv file into several chunks let original_chunk_iter = match original_documents { Some(original_documents) => { @@ -612,8 +602,6 @@ where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - // Merged databases are already been indexed, we start from this count; let mut databases_seen = MERGED_DATABASE_COUNT; @@ -657,7 +645,6 @@ where { let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs"); let _entered = span.enter(); - puffin::profile_scope!("compute_prefix_diffs"); current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; @@ -797,8 +784,6 @@ fn execute_word_prefix_docids( common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - puffin::profile_function!(); - let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); builder.chunk_compression_type = indexer_config.chunk_compression_type; builder.chunk_compression_level = indexer_config.chunk_compression_level; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 
733e74800..41a0a55cf 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -161,8 +161,6 @@ impl<'a, 'i> Transform<'a, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); let external_documents_ids = self.index.external_documents_ids(); let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; @@ -375,8 +373,6 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - // there may be duplicates in the documents to remove. to_remove.sort_unstable(); to_remove.dedup(); @@ -466,8 +462,6 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - let mut documents_deleted = 0; let mut document_sorter_value_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); @@ -686,8 +680,6 @@ impl<'a, 'i> Transform<'a, 'i> { where F: Fn(UpdateIndexingStep) + Sync, { - puffin::profile_function!(); - let primary_key = self .index .primary_key(wtxn)? 
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2345551ab..27f760c2a 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -118,65 +118,6 @@ impl TypedChunk { } } -impl TypedChunk { - pub fn to_debug_string(&self) -> String { - match self { - TypedChunk::FieldIdDocidFacetStrings(grenad) => { - format!("FieldIdDocidFacetStrings {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdDocidFacetNumbers(grenad) => { - format!("FieldIdDocidFacetNumbers {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::Documents(grenad) => { - format!("Documents {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdWordCountDocids(grenad) => { - format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => format!( - "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", - word_docids_reader.len(), - exact_word_docids_reader.len(), - word_fid_docids_reader.len() - ), - TypedChunk::WordPositionDocids(grenad) => { - format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::WordPairProximityDocids(grenad) => { - format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetStringDocids((grenad, _)) => { - format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetNumberDocids(grenad) => { - format!("FieldIdFacetNumberDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetExistsDocids(grenad) => { - format!("FieldIdFacetExistsDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetIsNullDocids(grenad) => { - format!("FieldIdFacetIsNullDocids {{ number_of_entries: {} }}", 
grenad.len()) - } - TypedChunk::FieldIdFacetIsEmptyDocids(grenad) => { - format!("FieldIdFacetIsEmptyDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::GeoPoints(grenad) => { - format!("GeoPoints {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension, embedder_name } => { - format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {}, embedder_name: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension, embedder_name) - } - TypedChunk::ScriptLanguageDocids(sl_map) => { - format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) - } - } - } -} - /// Write typed chunk in the corresponding LMDB database of the provided index. /// Return new documents seen. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] @@ -185,8 +126,6 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, wtxn: &mut RwTxn, ) -> Result<(RoaringBitmap, bool)> { - puffin::profile_function!(typed_chunks[0].to_debug_string()); - let mut is_merged_database = false; match typed_chunks[0] { TypedChunk::Documents(_) => { @@ -877,7 +816,6 @@ where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { - puffin::profile_function!(); let mut buffer = Vec::new(); let database = database.remap_types::(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 0fd39ce77..133f0e3a8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -398,8 +398,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. 
if self.index.number_of_documents(self.wtxn)? == 0 { diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 1db066058..925635f80 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -52,8 +52,6 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - puffin::profile_function!(); - // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index 272d465fd..9b6aa21ae 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -57,7 +57,6 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - puffin::profile_function!(); debug!("Computing and writing the word levels integers docids into LMDB on disk..."); let mut prefix_integer_docids_sorter = create_sorter( diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 8b438cef3..d47d6d14c 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -45,8 +45,6 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { name = "words_prefix_fst" )] pub fn execute(self) -> Result<()> { - puffin::profile_function!(); - let words_fst = self.index.words_fst(self.wtxn)?; let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; From b6d450d4842e863792f4324090fa16edeb652c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 27 May 2024 15:59:28 +0200 Subject: [PATCH 55/56] Remove puffin experimental feature --- index-scheduler/src/features.rs | 13 ------------- 
meilisearch-types/src/features.rs | 1 - meilisearch/src/routes/features.rs | 15 ++------------- 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/index-scheduler/src/features.rs b/index-scheduler/src/features.rs index 3be18a3f1..ae8e6728a 100644 --- a/index-scheduler/src/features.rs +++ b/index-scheduler/src/features.rs @@ -68,19 +68,6 @@ impl RoFeatures { .into()) } } - - pub fn check_puffin(&self) -> Result<()> { - if self.runtime.export_puffin_reports { - Ok(()) - } else { - Err(FeatureNotEnabledError { - disabled_action: "Outputting Puffin reports to disk", - feature: "export puffin reports", - issue_link: "https://github.com/meilisearch/product/discussions/693", - } - .into()) - } - } } impl FeatureData { diff --git a/meilisearch-types/src/features.rs b/meilisearch-types/src/features.rs index 04a5d9d6f..dda9dee51 100644 --- a/meilisearch-types/src/features.rs +++ b/meilisearch-types/src/features.rs @@ -6,7 +6,6 @@ pub struct RuntimeTogglableFeatures { pub vector_store: bool, pub metrics: bool, pub logs_route: bool, - pub export_puffin_reports: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 227b485c5..0e02309fa 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -47,8 +47,6 @@ pub struct RuntimeTogglableFeatures { pub metrics: Option, #[deserr(default)] pub logs_route: Option, - #[deserr(default)] - pub export_puffin_reports: Option, } async fn patch_features( @@ -68,21 +66,13 @@ async fn patch_features( vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store), metrics: new_features.0.metrics.unwrap_or(old_features.metrics), logs_route: new_features.0.logs_route.unwrap_or(old_features.logs_route), - export_puffin_reports: new_features - .0 - .export_puffin_reports - .unwrap_or(old_features.export_puffin_reports), }; // explicitly destructure for analytics rather than using the `Serialize` 
implementation, because // the it renames to camelCase, which we don't want for analytics. // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future. - let meilisearch_types::features::RuntimeTogglableFeatures { - vector_store, - metrics, - logs_route, - export_puffin_reports, - } = new_features; + let meilisearch_types::features::RuntimeTogglableFeatures { vector_store, metrics, logs_route } = + new_features; analytics.publish( "Experimental features Updated".to_string(), @@ -90,7 +80,6 @@ async fn patch_features( "vector_store": vector_store, "metrics": metrics, "logs_route": logs_route, - "export_puffin_reports": export_puffin_reports, }), Some(&req), ); From 487431a03538dc4132f8cfcff05959d4fc5e79c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 27 May 2024 16:12:20 +0200 Subject: [PATCH 56/56] Fix tests --- index-scheduler/src/insta_snapshot.rs | 1 - meilisearch/tests/dumps/mod.rs | 3 +-- meilisearch/tests/features/mod.rs | 20 +++++++------------- meilisearch/tests/search/hybrid.rs | 6 ++---- meilisearch/tests/settings/get_settings.rs | 3 +-- 5 files changed, 11 insertions(+), 22 deletions(-) diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 988e75b81..d8625a2c7 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -32,7 +32,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { features: _, max_number_of_tasks: _, max_number_of_batched_tasks: _, - puffin_frame: _, wake_up: _, dumps_path: _, snapshots_path: _, diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 1a31437f8..c8f8ca105 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1859,8 +1859,7 @@ async fn import_dump_v6_containing_experimental_features() { { "vectorStore": false, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false 
} "###); diff --git a/meilisearch/tests/features/mod.rs b/meilisearch/tests/features/mod.rs index 3a9812f30..9548567ff 100644 --- a/meilisearch/tests/features/mod.rs +++ b/meilisearch/tests/features/mod.rs @@ -20,8 +20,7 @@ async fn experimental_features() { { "vectorStore": false, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -32,8 +31,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -44,8 +42,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -57,8 +54,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -70,8 +66,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); } @@ -90,8 +85,7 @@ async fn experimental_feature_metrics() { { "vectorStore": false, "metrics": true, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -146,7 +140,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `vectorStore`, `metrics`, `logsRoute`, `exportPuffinReports`", + "message": "Unknown field `NotAFeature`: expected one of `vectorStore`, `metrics`, `logsRoute`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 028b341cb..9c50df6e1 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -18,8 +18,7 @@ async fn 
index_with_documents_user_provided<'a>( { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -47,8 +46,7 @@ async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> I { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index cd31d4959..379e0a917 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -98,8 +98,7 @@ async fn secrets_are_hidden_in_settings() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###);