diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 418a23717..36af79460 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -43,4 +43,11 @@ jobs:
       - name: Run benchmarks on PR ${{ github.event.issue.id }}
        run: |
-          cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" -- ${{ steps.command.outputs.command-arguments }}
\ No newline at end of file
+          cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" \
+            --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" \
+            --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" \
+            -- ${{ steps.command.outputs.command-arguments }} > benchlinks.txt
+
+      - name: Send comment in PR
+        run: |
+          gh pr comment ${{github.event.issue.number}} --body-file benchlinks.txt
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index e588b1b5b..e1d0c5feb 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -187,8 +187,8 @@ They are JSON files with the following structure (comments are not actually supp
     },
     // Core of the workload.
     // A list of commands to run sequentially.
-    // A command is a request to the Meilisearch instance that is executed while the profiling runs.
-    "commands": [
+    // Optional: A precommand is a request to the Meilisearch instance that is executed before the profiling runs.
+    "precommands": [
         {
             // Meilisearch route to call. `http://localhost:7700/` will be prepended.
             "route": "indexes/movies/settings",
@@ -224,8 +224,11 @@ They are JSON files with the following structure (comments are not actually supp
             // - DontWait: run the next command without waiting the response to this one.
             // - WaitForResponse: run the next command as soon as the response from the server is received.
             // - WaitForTask: run the next command once **all** the Meilisearch tasks created up to now have finished processing.
-            "synchronous": "DontWait"
-        },
+            "synchronous": "WaitForTask"
+        }
+    ],
+    // A command is a request to the Meilisearch instance that is executed while the profiling runs.
+ "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/Cargo.lock b/Cargo.lock index ab060ac93..966394cf6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,7 +80,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -378,9 +378,9 @@ dependencies = [ [[package]] name = "arroy" -version = "0.2.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efddeb1e7c32a551cc07ef4c3e181e3cd5478fdaf4f0bd799983171c1f6efe57" +checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9" dependencies = [ "bytemuck", "byteorder", @@ -424,7 +424,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -435,7 +435,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -486,6 +486,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64ct" version = "1.6.0" @@ -552,7 +558,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -685,7 +691,7 @@ checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -845,9 +851,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.90" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" dependencies = [ "jobserver", "libc", @@ -992,7 +998,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1304,7 +1310,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1326,7 +1332,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core 0.20.3", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1356,7 +1362,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1460,7 +1466,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1530,9 +1536,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "doxygen-rs" -version = "0.2.2" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff670ea0c9bbb8414e7efa6e23ebde2b8f520a7eef78273a3918cf1903e7505" +checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9" dependencies = [ "phf", ] @@ -1678,7 +1684,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.58", + 
"syn 2.0.60", ] [[package]] @@ -1698,7 +1704,7 @@ checksum = "03cdc46ec28bd728e67540c528013c6a10eb69a02eb31078a1bda695438cbfb8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1781,7 +1787,7 @@ dependencies = [ "darling 0.20.3", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "uuid", ] @@ -1913,7 +1919,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -2117,9 +2123,9 @@ checksum = "36d244a08113319b5ebcabad2b8b7925732d15eec46d7e7ac3c11734f3b7a6ad" [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "js-sys", @@ -2256,12 +2262,11 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.0-alpha.9" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934" +checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd" dependencies = [ "bitflags 2.5.0", - "bytemuck", "byteorder", "heed-traits", "heed-types", @@ -2275,15 +2280,15 @@ dependencies = [ [[package]] name = "heed-traits" -version = "0.20.0-alpha.9" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab0b7d9cde969ad36dde692e487dc89d97f7168bf6a7bd3b894ad4bf7278298" +checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" [[package]] name = "heed-types" -version = "0.20.0-alpha.9" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0cb3567a7363f28b597bf6e9897b9466397951dd0e52df2c8196dd8a71af44a" +checksum = "3cb0d6ba3700c9a57e83c013693e3eddb68a6d9b6781cacafc62a0d992e8ddb3" dependencies = [ "bincode", "byteorder", @@ -2413,7 +2418,7 @@ dependencies = [ "futures-util", "http 0.2.11", "hyper", - "rustls 0.21.10", + "rustls 0.21.12", "tokio", "tokio-rustls", ] @@ -2459,7 +2464,6 @@ dependencies = [ "meilisearch-auth", "meilisearch-types", "page_size 0.5.0", - "puffin", "rayon", "roaring", "serde", @@ -3157,7 +3161,7 @@ checksum = "fc2fb41a9bb4257a3803154bdf7e2df7d45197d1941c9b1a90ad815231630721" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3183,14 +3187,13 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629c123f5321b48fa4f8f4d3b868165b748d9ba79c7103fb58e3a94f736bcedd" +checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a" dependencies = [ "cc", "doxygen-rs", "libc", - "pkg-config", ] [[package]] @@ -3213,9 +3216,9 @@ checksum = "e34f76eb3611940e0e7d53a9aaa4e6a3151f69541a282fd0dad5571420c53ff1" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -3227,12 +3230,6 @@ version = 
"0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" -[[package]] -name = "lz4_flex" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83" - [[package]] name = "macro_rules_attribute" version = "0.2.0" @@ -3258,7 +3255,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3337,12 +3334,11 @@ dependencies = [ "pin-project-lite", "platform-dirs", "prometheus", - "puffin", "rand", "rayon", "regex", "reqwest", - "rustls 0.21.10", + "rustls 0.21.12", "rustls-pemfile", "segment", "serde", @@ -3505,7 +3501,6 @@ dependencies = [ "obkv", "once_cell", "ordered-float", - "puffin", "rand", "rand_pcg", "rayon", @@ -3598,7 +3593,7 @@ checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3942,7 +3937,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3996,7 +3991,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4025,7 +4020,7 @@ checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4133,9 +4128,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ "unicode-ident", ] @@ -4176,23 +4171,6 @@ version = "2.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" -[[package]] -name = "puffin" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76425abd4e1a0ad4bd6995dd974b52f414fca9974171df8e3708b3e660d05a21" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "cfg-if", - "instant", - "lz4_flex", - "once_cell", - "parking_lot", - "serde", -] - [[package]] name = "pulp" version = "0.18.9" @@ -4207,9 +4185,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -4391,7 +4369,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.10", + "rustls 0.21.12", "rustls-pemfile", "serde", "serde_json", @@ -4505,9 +4483,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.10" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", @@ -4517,9 +4495,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring", @@ -4540,9 +4518,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" [[package]] name = "rustls-webpki" @@ -4643,9 +4621,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.197" +version = "1.0.198" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = "9846a40c979031340571da2545a4e5b7c4163bdae79b301d5f86d03979451fcc" dependencies = [ "serde_derive", ] @@ -4661,20 +4639,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.198" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "e88edab869b01783ba905e7d0153f9fc1a6505a96e4ad3018011eedb838566d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "indexmap", "itoa", @@ -4941,7 +4919,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4963,9 +4941,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.58" +version = "2.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" dependencies = [ "proc-macro2", "quote", @@ -4989,7 +4967,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5100,7 +5078,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5243,7 +5221,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5252,7 +5230,7 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.21.10", + "rustls 0.21.12", "tokio", ] @@ -5354,7 +5332,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -5469,9 +5447,9 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" 
[[package]] name = "unicode-blocks" @@ -5529,15 +5507,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.6" +version = "2.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" +checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "flate2", "log", "once_cell", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pki-types", "rustls-webpki 0.102.2", "serde", @@ -5703,7 +5681,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "wasm-bindgen-shared", ] @@ -5737,7 +5715,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5834,7 +5812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -5843,7 +5821,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -5870,7 +5848,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -5905,17 +5883,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -5932,9 +5910,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" @@ -5950,9 +5928,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" @@ -5968,9 +5946,9 @@ checksum = 
"622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" @@ -5986,9 +5964,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" @@ -6004,9 +5982,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" @@ -6022,9 +6000,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" @@ -6040,9 +6018,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" [[package]] name = "winnow" @@ -6140,7 +6118,7 @@ checksum = "9e6936f0cce458098a201c245a11bef556c6a0181129c7034d10d76d1ec3a2b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "synstructure", ] @@ -6161,7 +6139,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -6181,7 +6159,7 @@ checksum = "e6a647510471d372f2e6c2e6b7219e44d8c574d24fdc11c610a61455782f18c3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "synstructure", ] diff --git a/dump/src/reader/mod.rs b/dump/src/reader/mod.rs index 5bbf4ec4d..2b3732164 100644 --- a/dump/src/reader/mod.rs +++ b/dump/src/reader/mod.rs @@ -197,6 +197,140 @@ pub(crate) mod test { use super::*; use crate::reader::v6::RuntimeTogglableFeatures; + #[test] + fn import_dump_v6_with_vectors() { + // dump containing two indexes + // + // "vector", configured with an embedder + // contains: + // - one document with an overriden vector, + // - one document with a natural vector + // - one document with a _vectors map containing one additional embedder name and a natural vector + // - one document with a _vectors map containing one additional embedder name and an overriden vector + // + // "novector", no embedder + // contains: + // - a document without vector + // - a document with a random 
_vectors field
+        let dump = File::open("tests/assets/v6-with-vectors.dump").unwrap();
+        let mut dump = DumpReader::open(dump).unwrap();
+
+        // top level infos
+        insta::assert_display_snapshot!(dump.date().unwrap(), @"2024-05-16 15:51:34.151044 +00:00:00");
+        insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None");
+
+        // tasks
+        let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
+        let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
+        meili_snap::snapshot_hash!(meili_snap::json_string!(tasks), @"278f63325ef06ca04d01df98d8207b94");
+        assert_eq!(update_files.len(), 10);
+        assert!(update_files[0].is_none()); // the dump creation
+        assert!(update_files[1].is_none());
+        assert!(update_files[2].is_none());
+        assert!(update_files[3].is_none());
+        assert!(update_files[4].is_none());
+        assert!(update_files[5].is_none());
+        assert!(update_files[6].is_none());
+        assert!(update_files[7].is_none());
+        assert!(update_files[8].is_none());
+        assert!(update_files[9].is_none());
+
+        // indexes
+        let mut indexes = dump.indexes().unwrap().collect::<Result<Vec<_>>>().unwrap();
+        // the indexes are not ordered in any way by default
+        indexes.sort_by_key(|index| index.metadata().uid.to_string());
+
+        let mut vector_index = indexes.pop().unwrap();
+        let mut novector_index = indexes.pop().unwrap();
+        assert!(indexes.is_empty());
+
+        // vector
+
+        insta::assert_json_snapshot!(vector_index.metadata(), @r###"
+        {
+          "uid": "vector",
+          "primaryKey": "id",
+          "createdAt": "2024-05-16T15:33:17.240962Z",
+          "updatedAt": "2024-05-16T15:40:55.723052Z"
+        }
+        "###);
+
+        {
+            let documents: Result<Vec<_>> = vector_index.documents().unwrap().collect();
+            let mut documents = documents.unwrap();
+            assert_eq!(documents.len(), 4);
+
+            documents.sort_by_key(|doc| doc.get("id").unwrap().to_string());
+
+            {
+                let document = documents.pop().unwrap();
+                insta::assert_json_snapshot!(document);
+            }
+
+            {
+                let document = documents.pop().unwrap();
+                insta::assert_json_snapshot!(document);
+            }
+
+            {
+                let document = documents.pop().unwrap();
+                insta::assert_json_snapshot!(document);
+            }
+
+            {
+                let document = documents.pop().unwrap();
+                insta::assert_json_snapshot!(document);
+            }
+        }
+
+        // novector
+
+        insta::assert_json_snapshot!(novector_index.metadata(), @r###"
+        {
+          "uid": "novector",
+          "primaryKey": "id",
+          "createdAt": "2024-05-16T15:33:03.568055Z",
+          "updatedAt": "2024-05-16T15:33:07.530217Z"
+        }
+        "###);
+
+        insta::assert_json_snapshot!(novector_index.settings().unwrap().embedders, @"null");
+
+        {
+            let documents: Result<Vec<_>> = novector_index.documents().unwrap().collect();
+            let mut documents = documents.unwrap();
+            assert_eq!(documents.len(), 2);
+
+            documents.sort_by_key(|doc| doc.get("id").unwrap().to_string());
+
+            {
+                let document = documents.pop().unwrap();
+                insta::assert_json_snapshot!(document, @r###"
+                {
+                  "id": "e1",
+                  "other": "random1",
+                  "_vectors": "toto"
+                }
+                "###);
+            }
+
+            {
+                let document = documents.pop().unwrap();
+                insta::assert_json_snapshot!(document, @r###"
+                {
+                  "id": "e0",
+                  "other": "random0"
+                }
+                "###);
+            }
+        }
+
+        assert_eq!(
+            dump.features().unwrap().unwrap(),
+            RuntimeTogglableFeatures { vector_store: true, ..Default::default() }
+        );
+    }
+
     #[test]
     fn import_dump_v6_experimental() {
         let dump = File::open("tests/assets/v6-with-experimental.dump").unwrap();
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap
new file mode 100644
index 000000000..43bdb9726
---
/dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap @@ -0,0 +1,783 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e3", + "desc": "overriden vector + map", + "_vectors": { + "default": [ + 0.2, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 
0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1 + ], + "toto": [ + 0.1 + ] + } +} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap new file mode 100644 index 000000000..0aad0ea97 --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap @@ -0,0 +1,786 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e2", + "desc": "natural vector + map", + "_vectors": { + "toto": [], + "default": { + "embeddings": [ + [ + -0.05189208313822746, + -0.9273212552070618, + 0.1443813145160675, + 0.0932632014155388, + 0.2665371894836426, + 0.36266782879829407, + 0.6402910947799683, + 0.32014018297195435, + 0.030915971845388412, + -0.9312191605567932, + -0.3718109726905823, + -0.2700554132461548, + -1.1014580726623535, + 0.9154956936836244, + -0.3406888246536255, + 1.0077725648880005, + 0.6577560901641846, + -0.3955195546150207, + -0.4148270785808563, + 0.1855088472366333, + 0.5062315464019775, + -0.3632686734199524, + -0.2277890294790268, + 0.2560805082321167, + -0.3853609561920166, + -0.1604762226343155, + -0.13947471976280212, + -0.20147813856601715, + -0.4466346800327301, + -0.3761846721172333, + 0.1443382054567337, + 0.18205296993255615, + 0.49359792470932007, + -0.22538000345230105, + -0.4996317625045776, + -0.22734887897968292, + -0.6034309267997742, + -0.7857939600944519, + -0.34923747181892395, + 
-0.3466345965862274, + 0.21176661550998688, + -0.5101462006568909, + -0.3403083384037018, + 0.000315118464641273, + 0.236465722322464, + -0.10246097296476364, + -1.3013339042663574, + 0.3419138789176941, + -0.32963496446609497, + -0.0901619717478752, + -0.5426247119903564, + 0.22656650841236117, + -0.44758284091949463, + 0.14151698350906372, + -0.1089438870549202, + 0.5500766634941101, + -0.670711100101471, + -0.6227269768714905, + 0.3894464075565338, + -0.27609574794769287, + 0.7028202414512634, + -0.19697771966457367, + 0.328511506319046, + 0.5063360929489136, + 0.4065195322036743, + 0.2614171802997589, + -0.30274391174316406, + 1.0393824577331543, + -0.7742937207221985, + -0.7874112129211426, + -0.6749666929244995, + 0.5190866589546204, + 0.004123548045754433, + -0.28312963247299194, + -0.038731709122657776, + -1.0142987966537476, + -0.09519586712121964, + 0.8755272626876831, + 0.4876938760280609, + 0.7811151742935181, + 0.85174959897995, + 0.11826585978269576, + 0.5373436808586121, + 0.3649002015590668, + 0.19064077734947205, + -0.00287026260048151, + -0.7305403351783752, + -0.015206154435873032, + -0.7899249196052551, + 0.19407285749912265, + 0.08596625179052353, + -0.28976231813430786, + -0.1525907665491104, + 0.3798313438892365, + 0.050306469202041626, + -0.5697937607765198, + 0.4219021201133728, + 0.276252806186676, + 0.1559903472661972, + 0.10030482709407806, + -0.4043720066547394, + -0.1969818025827408, + 0.5739826560020447, + 0.2116064727306366, + -1.4620544910430908, + -0.7802462577819824, + -0.24739810824394223, + -0.09791352599859238, + -0.4413802027702331, + 0.21549351513385773, + -0.9520436525344848, + -0.08762510865926743, + 0.08154498040676117, + -0.6154940724372864, + -1.01079523563385, + 0.885427713394165, + 0.6967288851737976, + 0.27186504006385803, + -0.43194177746772766, + -0.11248451471328735, + 0.7576630711555481, + 0.4998855590820313, + 0.0264343973249197, + 0.9872855544090272, + 0.5634694695472717, + 0.053698331117630005, + 0.19410227239131927, + 0.3570743501186371, + -0.23670297861099243, + -0.9114483594894408, + 0.07884842902421951, + 0.7318344116210938, + 0.44630110263824463, + 0.08745364099740982, + -0.347101628780365, + -0.4314247667789459, + -0.5060274004936218, + 0.003706763498485088, + 0.44320008158683777, + -0.00788921769708395, + -0.1368623524904251, + -0.17391923069953918, + 0.14473655819892883, + 0.10927865654230118, + 0.6974599361419678, + 0.005052129738032818, + -0.016953065991401672, + -0.1256176233291626, + -0.036742497235536575, + 0.5591985583305359, + -0.37619709968566895, + 0.22429119050502777, + 0.5403043031692505, + -0.8603790998458862, + -0.3456307053565979, + 0.9292937517166138, + 0.5074859261512756, + 0.6310645937919617, + -0.3091641068458557, + 0.46902573108673096, + 0.7891915440559387, + 0.4499550759792328, + 0.2744995653629303, + 0.2712305784225464, + -0.04349074140191078, + -0.3638863265514374, + 0.7839881777763367, + 0.7352104783058167, + -0.19457511603832245, + -0.5957832932472229, + -0.43704694509506226, + -1.084769368171692, + 0.4904985725879669, + 0.5385226011276245, + 0.1891629993915558, + 0.12338479608297348, + 0.8315675258636475, + -0.07830192148685455, + 1.0916285514831543, + -0.28066861629486084, + -1.3585069179534912, + 0.5203898549079895, + 0.08678033947944641, + -0.2566044330596924, + 0.09484415501356123, + -0.0180208683013916, + 1.0264745950698853, + -0.023572135716676712, + 0.5864979028701782, + 0.7625196576118469, + -0.2543414533138275, + -0.8877770900726318, + 0.7611982822418213, + -0.06220436468720436, + 
0.937336564064026, + 0.2704363465309143, + -0.37733694911003113, + 0.5076137781143188, + -0.30641937255859375, + 0.6252772808074951, + -0.0823579877614975, + -0.03736555948853493, + 0.4131673276424408, + -0.6514252424240112, + 0.12918265163898468, + -0.4483584463596344, + 0.6750786304473877, + -0.37008383870124817, + -0.02324833907186985, + 0.38027650117874146, + -0.26374951004981995, + 0.4346931278705597, + 0.42882832884788513, + -0.48798441886901855, + 1.1882442235946655, + 0.5132288336753845, + 0.5284568667411804, + -0.03538886830210686, + 0.29620853066444397, + -1.0683696269989014, + 0.25936177372932434, + 0.10404160618782043, + -0.25796034932136536, + 0.027896970510482788, + -0.09225251525640488, + 1.4811025857925415, + 0.641173779964447, + -0.13838383555412292, + -0.3437179923057556, + 0.5667019486427307, + -0.5400741696357727, + 0.31090837717056274, + 0.6470608115196228, + -0.3747067153453827, + -0.7364534735679626, + -0.07431528717279434, + 0.5173454880714417, + -0.6578747034072876, + 0.7107478976249695, + -0.7918999791145325, + -0.0648345872759819, + 0.609937846660614, + -0.7329513430595398, + 0.9741371870040894, + 0.17912346124649048, + -0.02658769302070141, + 0.5162150859832764, + -0.3978803157806397, + -0.7833885550498962, + -0.6497276425361633, + -0.3898126780986786, + -0.0952848568558693, + 0.2663288116455078, + -0.1604052186012268, + 0.373076468706131, + -0.8357769250869751, + -0.05217683315277099, + -0.2680160701274872, + 0.8389158248901367, + 0.6833611130714417, + -0.6712407469749451, + 0.7406917214393616, + -0.44522786140441895, + -0.34645363688468933, + -0.27384576201438904, + -0.9878405928611756, + -0.8166060447692871, + 0.06268279999494553, + 0.38567957282066345, + -0.3274703919887543, + 0.5296315550804138, + -0.11810623109340668, + 0.23029841482639313, + 0.08616159111261368, + -0.2195747196674347, + 0.09430307894945145, + 0.4057176411151886, + 0.4892159104347229, + -0.1636916548013687, + -0.6071445345878601, + 0.41256585717201233, + 0.622254490852356, + -0.41223976016044617, + -0.6686707139015198, + -0.7474371790885925, + -0.8509522080421448, + -0.16754287481307983, + -0.9078601002693176, + -0.29653599858283997, + -0.5020652413368225, + 0.4692700505256653, + 0.01281109917908907, + -0.16071580350399017, + 0.03388889133930206, + -0.020511148497462273, + 0.5027827024459839, + -0.20729811489582065, + 0.48107290267944336, + 0.33669769763946533, + -0.5275911688804626, + 0.48271527886390686, + 0.2738940715789795, + -0.033152539283037186, + -0.13629786670207977, + -0.05965912342071533, + -0.26200807094573975, + 0.04002794995903969, + -0.34095603227615356, + -3.986898899078369, + -0.46819332242012024, + -0.422744482755661, + -0.169097900390625, + 0.6008929014205933, + 0.058016058057546616, + -0.11401277780532836, + -0.3077819049358368, + -0.09595538675785063, + 0.6723822355270386, + 0.19367831945419312, + 0.28304359316825867, + 0.1609862744808197, + 0.7567598819732666, + 0.6889985799789429, + 0.06907720118761063, + -0.04188092052936554, + -0.7434936165809631, + 0.13321782648563385, + 0.8456063270568848, + -0.10364038497209548, + -0.45084846019744873, + -0.4758241474628449, + 0.43882066011428833, + -0.6432598829269409, + 0.7217311859130859, + -0.24189773201942444, + 0.12737572193145752, + -1.1008601188659668, + -0.3305315673351288, + 0.14614742994308472, + -0.7819333076477051, + 0.5287120342254639, + -0.055538054555654526, + 0.1877404749393463, + -0.6907662153244019, + 0.5616975426673889, + -0.4611121714115143, + -0.26109233498573303, + -0.12898315489292145, + 
-0.3724522292613983, + -0.7191406488418579, + -0.4425233602523804, + -0.644108235836029, + 0.8424481153488159, + 0.17532426118850708, + -0.5121750235557556, + -0.6467239260673523, + -0.0008507720194756985, + 0.7866212129592896, + -0.02644744887948036, + -0.005045140627771616, + 0.015782782807946205, + 0.16334445774555206, + -0.1913367658853531, + -0.13697923719882965, + -0.6684983372688293, + 0.18346354365348816, + -0.341105580329895, + 0.5427411198616028, + 0.3779832422733307, + -0.6778115034103394, + -0.2931850254535675, + -0.8805161714553833, + -0.4212774932384491, + -0.5368952751159668, + -1.3937891721725464, + -1.225494146347046, + 0.4276703894138336, + 1.1205668449401855, + -0.6005299687385559, + 0.15732505917549133, + -0.3914784789085388, + -1.357046604156494, + -0.4707142114639282, + -0.1497287154197693, + -0.25035548210144043, + -0.34328439831733704, + 0.39083412289619446, + 0.1623048633337021, + -0.9275814294815063, + -0.6430015563964844, + 0.2973862886428833, + 0.5580436587333679, + -0.6232585310935974, + -0.6611042022705078, + 0.4015969038009643, + -1.0232892036437988, + -0.2585645020008087, + -0.5431421399116516, + 0.5021264553070068, + -0.48601630330085754, + -0.010242084041237833, + 0.5862035155296326, + 0.7316920161247253, + 0.4036808013916016, + 0.4269520044326782, + -0.705938458442688, + 0.7747307419776917, + 0.10164368897676468, + 0.7887958884239197, + -0.9612497091293336, + 0.12755516171455383, + 0.06812842190265656, + -0.022603651508688927, + 0.14722754061222076, + -0.5588505268096924, + -0.20689940452575684, + 0.3557641804218292, + -0.6812759637832642, + 0.2860803008079529, + -0.38954633474349976, + 0.1759403496980667, + -0.5678874850273132, + -0.1692986786365509, + -0.14578519761562347, + 0.5711379051208496, + 1.0208125114440918, + 0.7759483456611633, + -0.372348427772522, + -0.5460885763168335, + 0.7190321683883667, + -0.6914990544319153, + 0.13365162909030914, + -0.4854792356491089, + 0.4054908752441406, + 0.4502798914909363, + -0.3041122555732727, + -0.06726965308189392, + -0.05570871382951737, + -0.0455719493329525, + 0.4785125255584717, + 0.8867972493171692, + 0.4107886850833893, + 0.6121342182159424, + -0.20477132499217987, + -0.5598517656326294, + -0.6443566679954529, + -0.5905212759971619, + -0.5571200251579285, + 0.17573799192905426, + -0.28621870279312134, + 0.1685224026441574, + 0.09719007462263109, + -0.04223639518022537, + -0.28623101115226746, + -0.1449810117483139, + -0.3789580464363098, + -0.5227636098861694, + -0.049728814512491226, + 0.7849089503288269, + 0.16792525351047516, + 0.9849340915679932, + -0.6559549570083618, + 0.35723909735679626, + -0.6822739243507385, + 1.2873116731643677, + 0.19993330538272855, + 0.03512010723352432, + -0.6972134113311768, + 0.18453484773635864, + -0.2437680810689926, + 0.2156416028738022, + 0.5230382680892944, + 0.22020135819911957, + 0.8314080238342285, + 0.15627102553844452, + -0.7330264449119568, + 0.3888184726238251, + -0.22034703195095065, + 0.5457669496536255, + -0.48084837198257446, + -0.45576658844947815, + -0.09287727624177931, + -0.06968110054731369, + 0.35125672817230225, + -0.4278119504451752, + 0.2038476765155792, + 0.11392722278833388, + 0.9433983564376832, + -0.4097744226455689, + 0.035297419875860214, + -0.4274404048919678, + -0.25100165605545044, + 1.0943366289138794, + -0.07634022831916809, + -0.2925529479980469, + -0.7512530088424683, + 0.2649727463722229, + -0.4078235328197479, + -0.3372223973274231, + 0.05190162733197212, + 0.005654910113662481, + -0.0001571219472680241, + -0.35445958375930786, + 
-0.7837416529655457, + 0.1500556766986847, + 0.4383024573326111, + 0.6099548935890198, + 0.05951934307813645, + -0.21325334906578064, + 0.0199207104742527, + -0.22704418003559113, + -0.6481077671051025, + 0.37442275881767273, + -1.015955924987793, + 0.38637226819992065, + -0.06489371508359909, + -0.494120329618454, + 0.3469836115837097, + 0.15402406454086304, + -0.7660972476005554, + -0.7053225040435791, + -0.25964751839637756, + 0.014004424214363098, + -0.2860170006752014, + -0.17565494775772095, + -0.45117494463920593, + -0.0031954257283359766, + 0.09676837921142578, + -0.514464259147644, + 0.41698193550109863, + -0.21642713248729703, + -0.5398141145706177, + -0.3647628426551819, + 0.37005379796028137, + 0.239425927400589, + -0.08833975344896317, + 0.934946596622467, + -0.48340797424316406, + 0.6241437792778015, + -0.7253676652908325, + -0.04303571209311485, + 1.1125205755233765, + -0.15692919492721558, + -0.2914651036262512, + -0.5117168426513672, + 0.21365483105182648, + 0.4924402534961701, + 0.5269662141799927, + 0.0352792888879776, + -0.149167999625206, + -0.6019760370254517, + 0.08245442807674408, + 0.4900692105293274, + 0.518824577331543, + -0.00005570516441366635, + -0.553304135799408, + 0.22217543423175812, + 0.5047767758369446, + 0.135724738240242, + 1.1511540412902832, + -0.3541218340396881, + -0.9712511897087096, + 0.8353699445724487, + -0.39227569103240967, + -0.9117669463157654, + -0.26349931955337524, + 0.05597023293375969, + 0.20695461332798004, + 0.3178807199001312, + 1.0663238763809204, + 0.5062212347984314, + 0.7288597822189331, + 0.09899299591779707, + 0.553720235824585, + 0.675009548664093, + -0.20067055523395536, + 0.3138423264026642, + -0.6886593103408813, + -0.2910398542881012, + -1.3186300992965698, + -0.4684459865093231, + -0.095743365585804, + -0.1257995069026947, + -0.4858281314373016, + -0.4935407340526581, + -0.3266896903514862, + -0.3928797245025635, + -0.40803104639053345, + -0.9975396394729614, + 0.4229583740234375, + 0.37309643626213074, + 0.4431034922599793, + 0.30364808440208435, + -0.3765178918838501, + 0.5616499185562134, + 0.16904796659946442, + -0.7343707084655762, + 0.2560209631919861, + 0.6166825294494629, + 0.3200829327106476, + -0.4483652710914612, + 0.16224201023578644, + -0.31495288014411926, + -0.42713335156440735, + 0.7270734906196594, + 0.7049484848976135, + -0.0571461021900177, + 0.04477125033736229, + -0.6647796034812927, + 1.183672308921814, + 0.36199676990509033, + 0.046881116926670074, + 0.4515796303749085, + 0.9278061985969543, + 0.31471705436706543, + -0.7073333859443665, + -0.3443860113620758, + 0.5440067052841187, + -0.15020819008350372, + -0.541202962398529, + 0.5203295946121216, + 1.2192286252975464, + -0.9983593225479126, + -0.18758884072303772, + 0.2758221924304962, + -0.6511523723602295, + -0.1584404855966568, + -0.236241415143013, + 0.2692437767982483, + -0.4941152036190033, + 0.4987454116344452, + -0.3331359028816223, + 0.3163745701313019, + 0.745529294013977, + -0.2905873656272888, + 0.13602906465530396, + 0.4679684340953827, + 1.0555986166000366, + 1.075700044631958, + 0.5368486046791077, + -0.5118206739425659, + 0.8668332099914551, + -0.5726966857910156, + -0.7811751961708069, + 0.1938626915216446, + -0.1929349899291992, + 0.1757766306400299, + 0.6384295225143433, + 0.26462844014167786, + 0.9542630314826964, + 0.19313029944896695, + 1.264248013496399, + -0.6304428577423096, + 0.0487106591463089, + -0.16211535036563873, + -0.7894763350486755, + 0.3582514822483063, + -0.04153040423989296, + 0.635784387588501, + 
0.6554391980171204, + -0.47010496258735657, + -0.8302040696144104, + -0.1350124627351761, + 0.2568812072277069, + 0.13614831864833832, + -0.2563649117946625, + -1.0434694290161133, + 0.3232482671737671, + 0.47882452607154846, + 0.4298652410507202, + 1.0563770532608032, + -0.28917592763900757, + -0.8533256649971008, + 0.10648339986801147, + 0.6376127004623413, + -0.20832888782024384, + 0.2370245456695557, + 0.0018312990432605147, + -0.2034837007522583, + 0.01051164511591196, + -1.105310082435608, + 0.29724350571632385, + 0.15604574978351593, + 0.1973688006401062, + 0.44394731521606445, + 0.3974513411521912, + -0.13625948131084442, + 0.9571986198425292, + 0.2257384955883026, + 0.2323588728904724, + -0.5583669543266296, + -0.7854922413825989, + 0.1647188365459442, + -1.6098142862319946, + 0.318587988615036, + -0.13399995863437653, + -0.2172701060771942, + -0.767514705657959, + -0.5813586711883545, + -0.3195130527019501, + -0.04894036799669266, + 0.2929930090904236, + -0.8213384747505188, + 0.07181350141763687, + 0.7469993829727173, + 0.6407455801963806, + 0.16365697979927063, + 0.7870153188705444, + 0.6524736881256104, + 0.6399973630905151, + -0.04992736503481865, + -0.03959266096353531, + -0.2512352466583252, + 0.8448855876922607, + -0.1422702670097351, + 0.1216789186000824, + -1.2647287845611572, + 0.5931149125099182, + 0.7186052203178406, + -0.06118432432413101, + -1.1942816972732544, + -0.17677085101604462, + 0.31543800234794617, + -0.32252824306488037, + 0.8255583047866821, + -0.14529970288276672, + -0.2695446312427521, + -0.33378756046295166, + -0.1653425395488739, + 0.1454019844532013, + -0.3920115828514099, + 0.912214994430542, + -0.7279734015464783, + 0.7374742031097412, + 0.933980405330658, + 0.13429680466651917, + -0.514870285987854, + 0.3989711999893189, + -0.11613689363002776, + 0.4022413492202759, + -0.9990655779838562, + -0.33749932050704956, + -0.4334589838981629, + -1.376373291015625, + -0.2993924915790558, + -0.09454808384180068, + -0.01314175222069025, + -0.001090060803107917, + 0.2137461006641388, + 0.2938512861728668, + 0.17508235573768616, + 0.8260607123374939, + -0.7218498587608337, + 0.2414487451314926, + -0.47296759486198425, + -0.3002610504627228, + -1.238540768623352, + 0.08663805574178696, + 0.6805586218833923, + 0.5909030437469482, + -0.42807504534721375, + -0.22887496650218964, + 0.47537800669670105, + -1.0474627017974854, + 0.6338009238243103, + 0.06548397243022919, + 0.4971011281013489, + 1.3484878540039063 + ] + ], + "userProvided": false + } + } +} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap new file mode 100644 index 000000000..f2a5e1d69 --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap @@ -0,0 +1,785 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e1", + "desc": "natural vector", + "_vectors": { + "default": { + "embeddings": [ + [ + -0.2979458272457123, + -0.5288640856742859, + -0.019957859069108963, + -0.18495318293571472, + 0.7429973483085632, + 0.5238497257232666, + 0.432366281747818, + 0.32744166254997253, + 0.0020762972999364138, + -0.9507834911346436, + -0.35097137093544006, + 0.08469701558351517, + -1.4176613092422483, + 0.4647577106952667, + -0.69340580701828, + 1.0372896194458008, + 0.3716741800308227, + 0.06031008064746857, + -0.6152024269104004, + 0.007914665155112743, + 0.7954924702644348, + -0.20773003995418549, + 
0.09376765787601472, + 0.04508133605122566, + -0.2084471583366394, + -0.1518009901046753, + 0.018195509910583496, + -0.07044368237257004, + -0.18119366466999057, + -0.4480230510234833, + 0.3822529911994934, + 0.1911812424659729, + 0.4674372375011444, + 0.06963984668254852, + -0.09341949224472046, + 0.005675444379448891, + -0.6774799227714539, + -0.7066726684570313, + -0.39256376028060913, + 0.04005039855837822, + 0.2084812968969345, + -0.7872875928878784, + -0.8205880522727966, + 0.2919981777667999, + -0.06004738807678223, + -0.4907574355602264, + -1.5937862396240234, + 0.24249385297298431, + -0.14709846675395966, + -0.11860740929841997, + -0.8299489617347717, + 0.472964346408844, + -0.497518390417099, + -0.22205302119255063, + -0.4196169078350067, + 0.32697558403015137, + -0.360930860042572, + -0.9789686799049376, + 0.1887447088956833, + -0.403737336397171, + 0.18524253368377688, + 0.3768732249736786, + 0.3666233420372009, + 0.3511938452720642, + 0.6985810995101929, + 0.41721710562705994, + 0.09754953533411026, + 0.6204307079315186, + -1.0762996673583984, + -0.06263761967420578, + -0.7376511693000793, + 0.6849768161773682, + -0.1745152473449707, + -0.40449759364128113, + 0.20757411420345304, + -0.8424443006515503, + 0.330015629529953, + 0.3489064872264862, + 1.0954371690750122, + 0.8487558960914612, + 1.1076823472976685, + 0.61430823802948, + 0.4155903458595276, + 0.4111340939998626, + 0.05753209814429283, + -0.06429877132177353, + -0.765606164932251, + -0.41703930497169495, + -0.508820652961731, + 0.19859947264194489, + -0.16607828438282013, + -0.28112146258354187, + 0.11032675206661224, + 0.38809511065483093, + -0.36498191952705383, + -0.48671194911003113, + 0.6755134463310242, + 0.03958442434668541, + 0.4478721618652344, + -0.10335399955511092, + -0.9546685814857484, + -0.6087718605995178, + 0.17498846352100372, + 0.08320838958024979, + -1.4478336572647097, + -0.605027437210083, + -0.5867993235588074, + -0.14711688458919525, + -0.5447602272033691, + -0.026259321719408035, + -0.6997418403625488, + -0.07349082082509995, + 0.10638900846242905, + -0.7133527398109436, + -0.9396815299987792, + 1.087092399597168, + 1.1885089874267578, + 0.4011896848678589, + -0.4089202582836151, + -0.10938972979784012, + 0.6726722121238708, + 0.24576938152313232, + -0.24247920513153076, + 1.1499971151351929, + 0.47813335061073303, + -0.05331678315997124, + 0.32338133454322815, + 0.4870913326740265, + -0.23144258558750153, + -1.2023426294326782, + 0.2349330335855484, + 1.080536961555481, + 0.29334118962287903, + 0.391574501991272, + -0.15818795561790466, + -0.2948290705680847, + -0.024689948186278343, + 0.06602869182825089, + 0.5937030911445618, + -0.047901444137096405, + -0.512734591960907, + -0.35780075192451477, + 0.28751692175865173, + 0.4298716187477112, + 0.9242428541183472, + -0.17208744585514069, + 0.11515070497989656, + -0.0335976779460907, + -0.3422986567020416, + 0.5344581604003906, + 0.19895796477794647, + 0.33001241087913513, + 0.6390730142593384, + -0.6074934005737305, + -0.2553696632385254, + 0.9644920229911804, + 0.2699219584465027, + 0.6403993368148804, + -0.6380003690719604, + -0.027310986071825027, + 0.638815701007843, + 0.27719101309776306, + -0.13553589582443237, + 0.750195324420929, + 0.1224869191646576, + -0.20613941550254825, + 0.8444448709487915, + 0.16200250387191772, + -0.24750925600528717, + -0.739950954914093, + -0.28443849086761475, + -1.176282525062561, + 0.516107976436615, + 0.3774825632572174, + 0.10906043648719788, + 0.07962015271186829, + 0.7384604215621948, + 
-0.051241904497146606, + 1.1730090379714966, + -0.4828610122203827, + -1.404372215270996, + 0.8811132311820984, + -0.3839482367038727, + 0.022516896948218346, + -0.0491158664226532, + -0.43027013540267944, + 1.2049334049224854, + -0.27309560775756836, + 0.6883630752563477, + 0.8264574408531189, + -0.5020735263824463, + -0.4874092042446137, + 0.6007202863693237, + -0.4965405762195587, + 1.1302915811538696, + 0.032572727650403976, + -0.3731859028339386, + 0.658271849155426, + -0.9023059010505676, + 0.7400162220001221, + 0.014550759457051754, + -0.19699542224407196, + 0.2319706380367279, + -0.789058268070221, + -0.14905710518360138, + -0.5826214551925659, + 0.207652747631073, + -0.4507439732551574, + -0.3163885474205017, + 0.3604124188423157, + -0.45119962096214294, + 0.3428427278995514, + 0.3005594313144684, + -0.36026081442832947, + 1.1014249324798584, + 0.40884315967559814, + 0.34991952776908875, + -0.1806638240814209, + 0.27440476417541504, + -0.7118373513221741, + 0.4645499587059021, + 0.214790478348732, + -0.2343102991580963, + 0.10500429570674896, + -0.28034430742263794, + 1.2267805337905884, + 1.0561333894729614, + -0.497364342212677, + -0.6143305897712708, + 0.24963727593421936, + -0.33136463165283203, + -0.01473914459347725, + 0.495918869972229, + -0.6985538005828857, + -1.0033197402954102, + 0.35937801003456116, + 0.6325868368148804, + -0.6808838844299316, + 1.0354058742523191, + -0.7214401960372925, + -0.33318862318992615, + 0.874398410320282, + -0.6594992280006409, + 0.6830640435218811, + -0.18534131348133087, + 0.024834271520376205, + 0.19901277124881744, + -0.5992477536201477, + -1.2126628160476685, + -0.9245557188987732, + -0.3898217976093292, + -0.1286519467830658, + 0.4217943847179413, + -0.1143646091222763, + 0.5630772709846497, + -0.5240639448165894, + 0.21152715384960177, + -0.3792001008987427, + 0.8266305327415466, + 1.170984387397766, + -0.8072142004966736, + 0.11382893472909927, + -0.17953898012638092, + -0.1789460331201553, + -0.15078622102737427, + -1.2082908153533936, + -0.7812382578849792, + -0.10903695970773696, + 0.7303897142410278, + -0.39054441452026367, + 0.19511254131793976, + -0.09121843427419662, + 0.22400228679180145, + 0.30143046379089355, + 0.1141919493675232, + 0.48112115263938904, + 0.7307931780815125, + 0.09701362252235413, + -0.2795647978782654, + -0.3997688889503479, + 0.5540812611579895, + 0.564578115940094, + -0.40065160393714905, + -0.3629159033298493, + -0.3789091110229492, + -0.7298538088798523, + -0.6996853351593018, + -0.4477842152118683, + -0.289089560508728, + -0.6430277824401855, + 0.2344944179058075, + 0.3742927014827728, + -0.5079357028007507, + 0.28841453790664673, + 0.06515737622976303, + 0.707315981388092, + 0.09498685598373412, + 0.8365515470504761, + 0.10002726316452026, + -0.7695478200912476, + 0.6264724135398865, + 0.7562043070793152, + -0.23112858831882477, + -0.2871039807796478, + -0.25010058283805847, + 0.2783474028110504, + -0.03224996477365494, + -0.9119359850883484, + -3.6940200328826904, + -0.5099936127662659, + -0.1604711413383484, + 0.17453284561634064, + 0.41759559512138367, + 0.1419190913438797, + -0.11362407356500626, + -0.33312007784843445, + 0.11511333286762238, + 0.4667884409427643, + -0.0031647447030991316, + 0.15879854559898376, + 0.3042248487472534, + 0.5404849052429199, + 0.8515422344207764, + 0.06286454200744629, + 0.43790125846862793, + -0.8682025074958801, + -0.06363756954669952, + 0.5547921657562256, + -0.01483887154608965, + -0.07361344993114471, + -0.929947018623352, + 0.3502565622329712, + 
-0.5080993175506592, + 1.0380364656448364, + -0.2017953395843506, + 0.21319580078125, + -1.0763001441955566, + -0.556368887424469, + 0.1949922740459442, + -0.6445739269256592, + 0.6791343688964844, + 0.21188358962535855, + 0.3736183941364288, + -0.21800459921360016, + 0.7597446441650391, + -0.3732394874095917, + -0.4710160195827484, + 0.025146087631583217, + 0.05341297015547752, + -0.9522109627723694, + -0.6000866889953613, + -0.08469046652317047, + 0.5966026186943054, + 0.3444081246852875, + -0.461188405752182, + -0.5279349088668823, + 0.10296865552663804, + 0.5175143480300903, + -0.20671147108078003, + 0.13392412662506104, + 0.4812754988670349, + 0.2993808686733246, + -0.3005635440349579, + 0.5141698122024536, + -0.6239235401153564, + 0.2877119481563568, + -0.4452739953994751, + 0.5621107816696167, + 0.5047508478164673, + -0.4226335883140564, + -0.18578553199768064, + -1.1967322826385498, + 0.28178197145462036, + -0.8692031502723694, + -1.1812998056411743, + -1.4526212215423584, + 0.4645712077617645, + 0.9327932000160216, + -0.6560136675834656, + 0.461549699306488, + -0.5621527433395386, + -1.328449010848999, + -0.08676894754171371, + 0.00021918353741057217, + -0.18864136934280396, + 0.1259666532278061, + 0.18240638077259064, + -0.14919660985469818, + -0.8965857625007629, + -0.7539900541305542, + 0.013973715715110302, + 0.504276692867279, + -0.704748272895813, + -0.6428424119949341, + 0.6303996443748474, + -0.5404738187789917, + -0.31176653504371643, + -0.21262824535369873, + 0.18736739456653595, + -0.7998970746994019, + 0.039946746081113815, + 0.7390344738960266, + 0.4283199906349182, + 0.3795057237148285, + 0.07204607129096985, + -0.9230587482452391, + 0.9440426230430604, + 0.26272690296173096, + 0.5598306655883789, + -1.0520871877670288, + -0.2677186131477356, + -0.1888762265443802, + 0.30426350235939026, + 0.4746131896972656, + -0.5746733546257019, + -0.4197768568992615, + 0.8565112948417664, + -0.6767723560333252, + 0.23448683321475983, + -0.2010004222393036, + 0.4112907350063324, + -0.6497949957847595, + -0.418667733669281, + -0.4950824975967407, + 0.44438859820365906, + 1.026281714439392, + 0.482397586107254, + -0.26220494508743286, + -0.3640787005424499, + 0.5907743573188782, + -0.8771642446517944, + 0.09708411991596222, + -0.3671700060367584, + 0.4331349730491638, + 0.619417667388916, + -0.2684665620326996, + -0.5123821496963501, + -0.1502324342727661, + -0.012190685607492924, + 0.3580845892429352, + 0.8617186546325684, + 0.3493645489215851, + 1.0270192623138428, + 0.18297909200191495, + -0.5881339311599731, + -0.1733516901731491, + -0.5040576457977295, + -0.340370237827301, + -0.26767754554748535, + -0.28570041060447693, + -0.032928116619586945, + 0.6029254794120789, + 0.17397655546665192, + 0.09346921741962431, + 0.27815181016921997, + -0.46699589490890503, + -0.8148876428604126, + -0.3964351713657379, + 0.3812595009803772, + 0.13547226786613464, + 0.7126688361167908, + -0.3473474085330963, + -0.06573959439992905, + -0.6483767032623291, + 1.4808889627456665, + 0.30924928188323975, + -0.5085946917533875, + -0.8613000512123108, + 0.3048902451992035, + -0.4241599142551422, + 0.15909206867218018, + 0.5764641761779785, + -0.07879110425710678, + 1.015336513519287, + 0.07599356025457382, + -0.7025855779647827, + 0.30047643184661865, + -0.35094937682151794, + 0.2522146999835968, + -0.2338722199201584, + -0.8326804637908936, + -0.13695412874221802, + -0.03452421352267265, + 0.47974953055381775, + -0.18385636806488037, + 0.32438594102859497, + 0.1797013282775879, + 0.787494957447052, + 
-0.12579888105392456, + -0.07507286965847015, + -0.4389670491218567, + 0.2720070779323578, + 0.8138866424560547, + 0.01974171027541161, + -0.3057698905467987, + -0.6709924936294556, + 0.0885881632566452, + -0.2862754464149475, + 0.03475658595561981, + -0.1285519152879715, + 0.3838353455066681, + -0.2944154739379883, + -0.4204859137535095, + -0.4416137933731079, + 0.13426260650157928, + 0.36733248829841614, + 0.573428750038147, + -0.14928072690963745, + -0.026076916605234143, + 0.33286052942276, + -0.5340145826339722, + -0.17279052734375, + -0.01154550164937973, + -0.6620771884918213, + 0.18390542268753052, + -0.08265615254640579, + -0.2489682286977768, + 0.2429984211921692, + -0.044153645634651184, + -0.986578404903412, + -0.33574509620666504, + -0.5387663841247559, + 0.19767941534519196, + 0.12540718913078308, + -0.3403128981590271, + -0.4154576361179352, + 0.17275673151016235, + 0.09407442808151244, + -0.5414086580276489, + 0.4393929839134216, + 0.1725579798221588, + -0.4998118281364441, + -0.6926208138465881, + 0.16552448272705078, + 0.6659538149833679, + -0.10949844866991044, + 0.986426830291748, + 0.01748848147690296, + 0.4003709554672241, + -0.5430638194084167, + 0.35347291827201843, + 0.6887399554252625, + 0.08274628221988678, + 0.13407137989997864, + -0.591465950012207, + 0.3446292281150818, + 0.6069018244743347, + 0.1935492902994156, + -0.0989871397614479, + 0.07008486241102219, + -0.8503749370574951, + -0.09507356584072112, + 0.6259510517120361, + 0.13934025168418884, + 0.06392545253038406, + -0.4112265408039093, + -0.08475656062364578, + 0.4974113404750824, + -0.30606114864349365, + 1.111435890197754, + -0.018766529858112335, + -0.8422622680664063, + 0.4325508773326874, + -0.2832120656967163, + -0.4859798848628998, + -0.41498348116874695, + 0.015977520495653152, + 0.5292825698852539, + 0.4538311660289765, + 1.1328668594360352, + 0.22632671892642975, + 0.7918671369552612, + 0.33401933312416077, + 0.7306135296821594, + 0.3548600673675537, + 0.12506209313869476, + 0.8573207855224609, + -0.5818327069282532, + -0.6953738927841187, + -1.6171947717666626, + -0.1699674427509308, + 0.6318262815475464, + -0.05671752244234085, + -0.28145185112953186, + -0.3976689279079437, + -0.2041076272726059, + -0.5495951175689697, + -0.5152917504310608, + -0.9309796094894408, + 0.101932130753994, + 0.1367802917957306, + 0.1490798443555832, + 0.5304336547851563, + -0.5082434415817261, + 0.06688683480024338, + 0.14657628536224365, + -0.782435953617096, + 0.2962816655635834, + 0.6965363621711731, + 0.8496337532997131, + -0.3042965829372406, + 0.04343798756599426, + 0.0330701619386673, + -0.5662598013877869, + 1.1086925268173218, + 0.756072998046875, + -0.204134538769722, + 0.2404300570487976, + -0.47848284244537354, + 1.3659011125564575, + 0.5645433068275452, + -0.15836156904697418, + 0.43395575881004333, + 0.5944653749465942, + 1.0043466091156006, + -0.49446743726730347, + -0.5954391360282898, + 0.5341240763664246, + 0.020598189905285835, + -0.4036853015422821, + 0.4473709762096405, + 1.1998231410980225, + -0.9317775368690492, + -0.23321466147899628, + 0.2052552700042725, + -0.7423108816146851, + -0.19917210936546328, + -0.1722569614648819, + -0.034072667360305786, + -0.00671181408688426, + 0.46396249532699585, + -0.1372445821762085, + 0.053376372903585434, + 0.7392690777778625, + -0.38447609543800354, + 0.07497968524694443, + 0.5197252631187439, + 1.3746477365493774, + 0.9060075879096984, + 0.20000585913658145, + -0.4053704142570496, + 0.7497360110282898, + -0.34087055921554565, + -1.101803183555603, + 
0.273650586605072, + -0.5125769376754761, + 0.22472351789474487, + 0.480757474899292, + -0.19845178723335263, + 0.8857700824737549, + 0.30752456188201904, + 1.1109285354614258, + -0.6768012642860413, + 0.524367094039917, + -0.22495046257972717, + -0.4224412739276886, + 0.40753406286239624, + -0.23133376240730288, + 0.3297771215438843, + 0.4905449151992798, + -0.6813114285469055, + -0.7543983459472656, + -0.5599071383476257, + 0.14351597428321838, + -0.029278717935085297, + -0.3970443606376648, + -0.303079217672348, + 0.24161772429943085, + 0.008353390730917454, + -0.0062365154735744, + 1.0824860334396362, + -0.3704061508178711, + -1.0337258577346802, + 0.04638749733567238, + 1.163011074066162, + -0.31737643480300903, + 0.013986887410283089, + 0.19223114848136905, + -0.2260770797729492, + -0.210910826921463, + -1.0191949605941772, + 0.22356095910072327, + 0.09353553503751756, + 0.18096882104873657, + 0.14867214858531952, + 0.43408671021461487, + -0.33312076330184937, + 0.8173948526382446, + 0.6428242921829224, + 0.20215003192424777, + -0.6634518504142761, + -0.4132290482521057, + 0.29815030097961426, + -1.579406976699829, + -0.0981958732008934, + -0.03941014781594277, + 0.1709178239107132, + -0.5481140613555908, + -0.5338194966316223, + -0.3528362512588501, + -0.11561278253793716, + -0.21793591976165771, + -1.1570470333099363, + 0.2157980799674988, + 0.42083489894866943, + 0.9639263153076172, + 0.09747201204299928, + 0.15671424567699432, + 0.4034591615200043, + 0.6728067994117737, + -0.5216875672340393, + 0.09657668322324751, + -0.2416689097881317, + 0.747975766658783, + 0.1021689772605896, + 0.11652665585279463, + -1.0484966039657593, + 0.8489304780960083, + 0.7169828414916992, + -0.09012343734502792, + -1.3173753023147583, + 0.057890523225069046, + -0.006231260951608419, + -0.1018214002251625, + 0.936040461063385, + -0.0502331368625164, + -0.4284322261810303, + -0.38209280371665955, + -0.22668412327766416, + 0.0782942995429039, + -0.4881664514541626, + 0.9268959760665894, + 0.001867273123934865, + 0.42261114716529846, + 0.8283362984657288, + 0.4256294071674347, + -0.7965338826179504, + 0.4840078353881836, + -0.19861412048339844, + 0.33977967500686646, + -0.4604192078113556, + -0.3107339143753052, + -0.2839638590812683, + -1.5734281539916992, + 0.005220232997089624, + 0.09239906817674635, + -0.7828494906425476, + -0.1397123783826828, + 0.2576255202293396, + 0.21372435986995697, + -0.23169949650764465, + 0.4016408920288086, + -0.462497353553772, + -0.2186472862958908, + -0.5617868900299072, + -0.3649831712245941, + -1.1585862636566162, + -0.08222806453704834, + 0.931126832962036, + 0.4327389597892761, + -0.46451422572135925, + -0.5430706143379211, + -0.27434298396110535, + -0.9479129314422609, + 0.1845661848783493, + 0.3972720205783844, + 0.4883299469947815, + 1.04031240940094 + ] + ], + "userProvided": false + } + } +} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap new file mode 100644 index 000000000..4bd0e2c3e --- /dev/null +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-8.snap @@ -0,0 +1,780 @@ +--- +source: dump/src/reader/mod.rs +expression: document +--- +{ + "id": "e0", + "desc": "overriden vector", + "_vectors": { + "default": [ + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 
0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, 
+ 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1 + ] + } +} diff --git a/dump/tests/assets/v6-with-vectors.dump b/dump/tests/assets/v6-with-vectors.dump new file mode 100644 index 000000000..9f8ed2ba1 Binary files /dev/null and b/dump/tests/assets/v6-with-vectors.dump differ diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index fa5b70606..6bfbbb024 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -568,7 +568,7 @@ pub mod tests { insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#); insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#); insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#); - // but it also works with other sequencies + // but it also works with other sequences insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}"); } diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index c758f1114..21fa34733 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -22,7 +22,6 @@ flate2 = "1.0.28" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } page_size = "0.5.0" -puffin = { version = "0.16.0", features = ["serialization"] } rayon = "1.8.1" roaring = { version = "0.10.2", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } @@ -37,7 +36,7 @@ time = { version = "0.3.31", features = [ "macros", ] } tracing = "0.1.40" -ureq = "2.9.1" +ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 3161dc499..181ac49a3 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -13,7 +13,7 @@ We can combine the two tasks in a single batch: 1. 
import documents X and Y Processing this batch is functionally equivalent to processing the two -tasks individally, but should be much faster since we are only performing +tasks individually, but should be much faster since we are only performing one indexing operation. */ @@ -31,6 +31,9 @@ use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; +use meilisearch_types::milli::vector::parsed_vectors::{ + ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, +}; use meilisearch_types::milli::{self, Filter}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; @@ -526,8 +529,6 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(crate::tests::FailureLocation::InsideCreateBatch)?; - puffin::profile_function!(); - let enqueued = &self.get_status(rtxn, Status::Enqueued)?; let to_cancel = self.get_kind(rtxn, Kind::TaskCancelation)? & enqueued; @@ -636,8 +637,6 @@ impl IndexScheduler { self.breakpoint(crate::Breakpoint::InsideProcessBatch); } - puffin::profile_function!(batch.to_string()); - match batch { Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => { // 1. Retrieve the tasks that matched the query at enqueue-time. @@ -785,10 +784,12 @@ impl IndexScheduler { let dst = temp_snapshot_dir.path().join("auth"); fs::create_dir_all(&dst)?; // TODO We can't use the open_auth_store_env function here but we should - let auth = milli::heed::EnvOpenOptions::new() - .map_size(1024 * 1024 * 1024) // 1 GiB - .max_dbs(2) - .open(&self.auth_path)?; + let auth = unsafe { + milli::heed::EnvOpenOptions::new() + .map_size(1024 * 1024 * 1024) // 1 GiB + .max_dbs(2) + .open(&self.auth_path) + }?; auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; // 5. 
Copy and tarball the flat snapshot @@ -914,8 +915,55 @@ impl IndexScheduler { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } - let (_id, doc) = ret?; - let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + let (id, doc) = ret?; + + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + 'inject_vectors: { + let embeddings = index.embeddings(&rtxn, id)?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME.to_owned()) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(milli::Error::UserError( + milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&rtxn, std::iter::once(id)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={id}") + } + }, + value: vectors.clone(), + }, + ) + .into()); + }; + + for (embedder_name, embeddings) in embeddings { + // don't change the entry if it already exists, because it was user-provided + vectors.entry(embedder_name).or_insert_with(|| { + let embeddings = ExplicitVectors { + embeddings: VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + ), + user_provided: false, + }; + serde_json::to_value(embeddings).unwrap() + }); + } + } + index_dumper.push_document(&document)?; } @@ -1174,8 +1222,6 @@ impl IndexScheduler { index: &'i Index, operation: IndexOperation, ) -> Result> { - puffin::profile_function!(); - match operation { IndexOperation::DocumentClear { mut tasks, .. } => { let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; diff --git a/index-scheduler/src/features.rs b/index-scheduler/src/features.rs index 3be18a3f1..ae8e6728a 100644 --- a/index-scheduler/src/features.rs +++ b/index-scheduler/src/features.rs @@ -68,19 +68,6 @@ impl RoFeatures { .into()) } } - - pub fn check_puffin(&self) -> Result<()> { - if self.runtime.export_puffin_reports { - Ok(()) - } else { - Err(FeatureNotEnabledError { - disabled_action: "Outputting Puffin reports to disk", - feature: "export puffin reports", - issue_link: "https://github.com/meilisearch/product/discussions/693", - } - .into()) - } - } } impl FeatureData { diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 988e75b81..d8625a2c7 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -32,7 +32,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { features: _, max_number_of_tasks: _, max_number_of_batched_tasks: _, - puffin_frame: _, wake_up: _, dumps_path: _, snapshots_path: _, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5704f5354..8a1c2f540 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -33,7 +33,6 @@ pub type Result = std::result::Result; pub type TaskId = u32; use std::collections::{BTreeMap, HashMap}; -use std::fs::File; use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; @@ -59,7 +58,6 @@ use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfi use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; -use puffin::FrameView; use rayon::current_num_threads; use 
rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -344,9 +342,6 @@ pub struct IndexScheduler { /// The Authorization header to send to the webhook URL. pub(crate) webhook_authorization_header: Option, - /// A frame to output the indexation profiling files to disk. - pub(crate) puffin_frame: Arc, - /// The path used to create the dumps. pub(crate) dumps_path: PathBuf, @@ -401,7 +396,6 @@ impl IndexScheduler { cleanup_enabled: self.cleanup_enabled, max_number_of_tasks: self.max_number_of_tasks, max_number_of_batched_tasks: self.max_number_of_batched_tasks, - puffin_frame: self.puffin_frame.clone(), snapshots_path: self.snapshots_path.clone(), dumps_path: self.dumps_path.clone(), auth_path: self.auth_path.clone(), @@ -453,10 +447,12 @@ impl IndexScheduler { ) }; - let env = heed::EnvOpenOptions::new() - .max_dbs(11) - .map_size(budget.task_db_size) - .open(options.tasks_path)?; + let env = unsafe { + heed::EnvOpenOptions::new() + .max_dbs(11) + .map_size(budget.task_db_size) + .open(options.tasks_path) + }?; let features = features::FeatureData::new(&env, options.instance_features)?; @@ -498,7 +494,6 @@ impl IndexScheduler { env, // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things wake_up: Arc::new(SignalEvent::auto(true)), - puffin_frame: Arc::new(puffin::GlobalFrameView::default()), autobatching_enabled: options.autobatching_enabled, cleanup_enabled: options.cleanup_enabled, max_number_of_tasks: options.max_number_of_tasks, @@ -585,9 +580,9 @@ impl IndexScheduler { } fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { - if let Ok(env) = + if let Ok(env) = unsafe { heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) - { + } { env.prepare_for_closing().wait(); true } else { @@ -619,10 +614,6 @@ impl IndexScheduler { run.wake_up.wait(); loop { - let puffin_enabled = run.features().check_puffin().is_ok(); - puffin::set_scopes_on(puffin_enabled); - puffin::GlobalProfiler::lock().new_frame(); - match run.tick() { Ok(TickOutcome::TickAgain(_)) => (), Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(), @@ -634,31 +625,6 @@ impl IndexScheduler { } } } - - // Let's write the previous frame to disk but only if - // the user wanted to profile with puffin. - if puffin_enabled { - let mut frame_view = run.puffin_frame.lock(); - if !frame_view.is_empty() { - let now = OffsetDateTime::now_utc(); - let mut file = match File::create(format!("{}.puffin", now)) { - Ok(file) => file, - Err(e) => { - tracing::error!("{e}"); - continue; - } - }; - if let Err(e) = frame_view.save_to_writer(&mut file) { - tracing::error!("{e}"); - } - if let Err(e) = file.sync_all() { - tracing::error!("{e}"); - } - // We erase this frame view as it is no more useful. We want to - // measure the new frames now that we exported the previous ones. - *frame_view = FrameView::default(); - } - } } }) .unwrap(); @@ -1772,6 +1738,7 @@ mod tests { use big_s::S; use crossbeam::channel::RecvTimeoutError; use file_store::File; + use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; use meilisearch_auth::AuthFilter; use meilisearch_types::document_formats::DocumentFormatError; @@ -1849,7 +1816,7 @@ mod tests { // To be 100% consistent between all test we're going to start the scheduler right now // and ensure it's in the expected starting state. 
- let breakpoint = match receiver.recv_timeout(std::time::Duration::from_secs(1)) { + let breakpoint = match receiver.recv_timeout(std::time::Duration::from_secs(10)) { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") @@ -1960,7 +1927,7 @@ mod tests { fn advance(&mut self) -> Breakpoint { let (breakpoint_1, b) = match self .test_breakpoint_rcv - .recv_timeout(std::time::Duration::from_secs(5)) + .recv_timeout(std::time::Duration::from_secs(50)) { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { @@ -1981,7 +1948,7 @@ mod tests { let (breakpoint_2, b) = match self .test_breakpoint_rcv - .recv_timeout(std::time::Duration::from_secs(5)) + .recv_timeout(std::time::Duration::from_secs(50)) { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { @@ -4980,4 +4947,233 @@ mod tests { ---------------------------------------------------------------------- "###); } + + #[test] + fn import_vectors() { + use meilisearch_types::settings::{Settings, Unchecked}; + use milli::update::Setting; + + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let mut new_settings: Box> = Box::default(); + let mut embedders = BTreeMap::default(); + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), + api_key: Setting::Set(S("My super secret")), + url: Setting::Set(S("http://localhost:7777")), + dimensions: Setting::Set(384), + ..Default::default() + }; + embedders.insert(S("A_fakerest"), Setting::Set(embedding_settings)); + + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}} the {{doc.breed}} best doggo")), + ..Default::default() + }; + embedders.insert(S("B_small_hf"), Setting::Set(embedding_settings)); + + new_settings.embedders = Setting::Set(embedders); + + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings, + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + let (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) = { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, fakerest_config) = configs.get(0).unwrap(); + insta::assert_json_snapshot!(name, 
@r###""A_fakerest""###); + insta::assert_json_snapshot!(fakerest_config.embedder_options); + let fakerest_name = name.clone(); + + let (name, simple_hf_config) = configs.get(1).unwrap(); + insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + insta::assert_json_snapshot!(simple_hf_config.embedder_options); + let simple_hf_name = name.clone(); + + let configs = index_scheduler.embedders(configs).unwrap(); + let (hf_embedder, _) = configs.get(&simple_hf_name).unwrap(); + let beagle_embed = hf_embedder.embed_one(S("Intel the beagle best doggo")).unwrap(); + let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo")).unwrap(); + let patou_embed = hf_embedder.embed_one(S("kefir the patou best doggo")).unwrap(); + (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) + }; + + // add one doc, specifying vectors + + let doc = serde_json::json!( + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "userProvided": true, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "userProvided": false, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); + let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); + + handle.advance_one_successful_batch(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "adding Intel succeeds"); + + // check embeddings + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let embeddings = index.embeddings(&rtxn, 0).unwrap(); + + assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + + let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let doc = obkv_to_json( + &[ + fields_ids_map.id("doggo").unwrap(), + fields_ids_map.id("breed").unwrap(), + fields_ids_map.id("_vectors").unwrap(), + ], + &fields_ids_map, + doc, + ) + .unwrap(); + assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); + } + + // update the doc, specifying vectors + + let doc = serde_json::json!( + { + "id": 0, + "doggo": "kefir", + "breed": "patou", + } + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1u128).unwrap(); + let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + 
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); + + { + // check embeddings + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let embeddings = index.embeddings(&rtxn, 0).unwrap(); + + // automatically changed to patou + assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); + // remained beagle because set to userProvided + assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + + let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let doc = obkv_to_json( + &[ + fields_ids_map.id("doggo").unwrap(), + fields_ids_map.id("breed").unwrap(), + fields_ids_map.id("_vectors").unwrap(), + ], + &fields_ids_map, + doc, + ) + .unwrap(); + assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); + } + } + } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap new file mode 100644 index 000000000..718ea229c --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap @@ -0,0 +1,19 @@ +--- +source: index-scheduler/src/lib.rs +expression: doc +--- +{ + "doggo": "kefir", + "breed": "patou", + "_vectors": { + "A_fakerest": { + "embeddings": "[vector]", + "userProvided": true + }, + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap new file mode 100644 index 000000000..bc16fc8be --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap @@ -0,0 +1,20 @@ +--- +source: index-scheduler/src/lib.rs +expression: task.details +--- +{ + "embedders": { + "A_fakerest": { + "source": "rest", + "apiKey": "MyXXXX...", + "dimensions": 384, + "url": "http://localhost:7777" + }, + "B_small_hf": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}} the {{doc.breed}} best doggo" + } + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap new file mode 100644 index 000000000..013115a58 --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap @@ -0,0 +1,23 @@ +--- +source: index-scheduler/src/lib.rs +expression: fakerest_config.embedder_options +--- +{ + "Rest": { + "api_key": "My super secret", + "distribution": null, + "dimensions": 384, + "url": "http://localhost:7777", + "query": null, + "input_field": [ + "input" + ], + "path_to_embeddings": [ + "data" + ], + "embedding_object": [ + "embedding" + ], + "input_type": "text" + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap new file mode 100644 index 000000000..712a62c77 --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap @@ -0,0 +1,11 @@ +--- +source: index-scheduler/src/lib.rs +expression: simple_hf_config.embedder_options +--- +{ 
+ "HuggingFace": { + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "distribution": null + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap new file mode 100644 index 000000000..002a42e59 --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap @@ -0,0 +1,19 @@ +--- +source: index-scheduler/src/lib.rs +expression: doc +--- +{ + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + "A_fakerest": { + "embeddings": "[vector]", + "userProvided": true + }, + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap new file mode 100644 index 000000000..bc16fc8be --- /dev/null +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap @@ -0,0 +1,20 @@ +--- +source: index-scheduler/src/lib.rs +expression: task.details +--- +{ + "embedders": { + "A_fakerest": { + "source": "rest", + "apiKey": "MyXXXX...", + "dimensions": 384, + "url": "http://localhost:7777" + }, + "B_small_hf": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}} the {{doc.breed}} best doggo" + } + } +} diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap new file mode 100644 index 000000000..6b285ba56 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -0,0 +1,49 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, 
sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +2 {uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,2,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,2,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,2,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap new file mode 100644 index 000000000..6f23d96fd --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -0,0 +1,48 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: 
succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [2,] +succeeded [0,1,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,2,] +"settingsUpdate" [0,] 
+---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,2,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000001 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap new file mode 100644 index 000000000..5dcb5a4f7 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ -0,0 +1,45 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: 
NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap new file mode 100644 index 000000000..80521df42 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap @@ -0,0 +1,44 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: 
Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [1,] +succeeded [0,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 0, field_distribution: {} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000000 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap new file mode 100644 index 000000000..97b669f44 --- /dev/null +++ 
b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -0,0 +1,36 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [0,] +---------------------------------------------------------------------- +### Kind: +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + 
+---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap new file mode 100644 index 000000000..f3ce4b104 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap @@ -0,0 +1,40 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: 
NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,] +---------------------------------------------------------------------- +### Kind: +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 0, field_distribution: {} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs index 9f6f90db2..260ff6ee4 100644 --- a/index-scheduler/src/utils.rs +++ b/index-scheduler/src/utils.rs @@ -272,9 +272,9 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { } for index_uid in index_uids { if index_uid == swap.0 { - *index_uid = swap.1.to_owned(); + swap.1.clone_into(index_uid); } else if index_uid == swap.1 { - *index_uid = swap.0.to_owned(); + swap.0.clone_into(index_uid); } } } diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs index 1eebd3fe9..ef992e836 100644 --- a/meilisearch-auth/src/store.rs +++ b/meilisearch-auth/src/store.rs @@ -49,7 +49,7 @@ pub fn open_auth_store_env(path: &Path) -> milli::heed::Result let mut options = EnvOpenOptions::new(); options.map_size(AUTH_STORE_SIZE); // 1GB options.max_dbs(2); - options.open(path) + unsafe { options.open(path) } } impl HeedAuthStore { diff --git a/meilisearch-types/src/deserr/mod.rs b/meilisearch-types/src/deserr/mod.rs index 537b24574..bf1aa1da5 100644 --- a/meilisearch-types/src/deserr/mod.rs +++ b/meilisearch-types/src/deserr/mod.rs @@ -26,7 +26,7 @@ pub type DeserrQueryParamError = DeserrError { pub msg: String, diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index eea012331..85a2cd767 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -384,7 +384,6 @@ impl ErrorCode for milli::Error { UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType, - UserError::InvalidVectorsType { .. } => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, UserError::SortError(_) => Code::InvalidSearchSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { @@ -423,7 +422,6 @@ impl ErrorCode for HeedError { HeedError::Mdb(_) | HeedError::Encoding(_) | HeedError::Decoding(_) - | HeedError::InvalidDatabaseTyping | HeedError::DatabaseClosing | HeedError::BadOpenOptions { .. 
} => Code::Internal, } diff --git a/meilisearch-types/src/features.rs b/meilisearch-types/src/features.rs index 04a5d9d6f..dda9dee51 100644 --- a/meilisearch-types/src/features.rs +++ b/meilisearch-types/src/features.rs @@ -6,7 +6,6 @@ pub struct RuntimeTogglableFeatures { pub vector_store: bool, pub metrics: bool, pub logs_route: bool, - pub export_puffin_reports: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 4a2b11b21..75962c450 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -67,7 +67,6 @@ permissive-json-pointer = { path = "../permissive-json-pointer" } pin-project-lite = "0.2.13" platform-dirs = "0.3.0" prometheus = { version = "0.13.3", features = ["process"] } -puffin = { version = "0.16.0", features = ["serialization"] } rand = "0.8.5" rayon = "1.8.0" regex = "1.10.2" @@ -75,7 +74,7 @@ reqwest = { version = "0.11.23", features = [ "rustls-tls", "json", ], default-features = false } -rustls = "0.21.6" +rustls = "0.21.12" rustls-pemfile = "1.0.2" segment = { version = "0.2.3", optional = true } serde = { version = "1.0.195", features = ["derive"] } diff --git a/meilisearch/src/middleware.rs b/meilisearch/src/middleware.rs index 6707bb6d5..f3e665c29 100644 --- a/meilisearch/src/middleware.rs +++ b/meilisearch/src/middleware.rs @@ -59,10 +59,12 @@ where let request_path = req.path(); let is_registered_resource = req.resource_map().has_resource(request_path); if is_registered_resource { + let request_pattern = req.match_pattern(); + let metric_path = request_pattern.as_ref().map_or(request_path, String::as_str); let request_method = req.method().to_string(); histogram_timer = Some( crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS - .with_label_values(&[&request_method, request_path]) + .with_label_values(&[&request_method, metric_path]) .start_timer(), ); } diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 227b485c5..0e02309fa 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -47,8 +47,6 @@ pub struct RuntimeTogglableFeatures { pub metrics: Option, #[deserr(default)] pub logs_route: Option, - #[deserr(default)] - pub export_puffin_reports: Option, } async fn patch_features( @@ -68,21 +66,13 @@ async fn patch_features( vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store), metrics: new_features.0.metrics.unwrap_or(old_features.metrics), logs_route: new_features.0.logs_route.unwrap_or(old_features.logs_route), - export_puffin_reports: new_features - .0 - .export_puffin_reports - .unwrap_or(old_features.export_puffin_reports), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because // the it renames to camelCase, which we don't want for analytics. // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future. 
- let meilisearch_types::features::RuntimeTogglableFeatures { - vector_store, - metrics, - logs_route, - export_puffin_reports, - } = new_features; + let meilisearch_types::features::RuntimeTogglableFeatures { vector_store, metrics, logs_route } = + new_features; analytics.publish( "Experimental features Updated".to_string(), @@ -90,7 +80,6 @@ async fn patch_features( "vector_store": vector_store, "metrics": metrics, "logs_route": logs_route, - "export_puffin_reports": export_puffin_reports, }), Some(&req), ); diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a383434a2..34ebe463d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -730,7 +730,7 @@ pub fn perform_search( let mut ids = BTreeSet::new(); for attr in attrs { if attr == "*" { - ids = displayed_ids.clone(); + ids.clone_from(&displayed_ids); break; } diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 6d5044d20..415da0c15 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -85,8 +85,13 @@ impl SearchQueue { }, search_request = receive_new_searches.recv() => { - // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web - let search_request = search_request.unwrap(); + let search_request = match search_request { + Some(search_request) => search_request, + // This should never happen while actix-web is running, but it's not a reason to crash + // and it can generate a lot of noise in the tests. + None => continue, + }; + if searches_running < usize::from(parallelism) && queue.is_empty() { searches_running += 1; // if the search requests die it's not a hard error on our side diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 1a31437f8..c8f8ca105 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1859,8 +1859,7 @@ async fn import_dump_v6_containing_experimental_features() { { "vectorStore": false, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); diff --git a/meilisearch/tests/features/mod.rs b/meilisearch/tests/features/mod.rs index 3a9812f30..9548567ff 100644 --- a/meilisearch/tests/features/mod.rs +++ b/meilisearch/tests/features/mod.rs @@ -20,8 +20,7 @@ async fn experimental_features() { { "vectorStore": false, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -32,8 +31,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -44,8 +42,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -57,8 +54,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -70,8 +66,7 @@ async fn experimental_features() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); } @@ -90,8 +85,7 @@ async fn experimental_feature_metrics() { { "vectorStore": false, "metrics": true, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -146,7 +140,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field 
`NotAFeature`: expected one of `vectorStore`, `metrics`, `logsRoute`, `exportPuffinReports`", + "message": "Unknown field `NotAFeature`: expected one of `vectorStore`, `metrics`, `logsRoute`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 68ae4c0aa..9c50df6e1 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -5,7 +5,10 @@ use crate::common::index::Index; use crate::common::{Server, Value}; use crate::json; -async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> { +async fn index_with_documents_user_provided<'a>( + server: &'a Server, + documents: &Value, +) -> Index<'a> { let index = server.index("test"); let (response, code) = server.set_features(json!({"vectorStore": true})).await; @@ -15,8 +18,7 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); @@ -34,7 +36,38 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde index } -static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { +async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> Index<'a> { + let index = server.index("test"); + + let (response, code) = server.set_features(json!({"vectorStore": true})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ "embedders": {"default": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.title}}, {{doc.desc}}" + }}} )) + .await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + index +} + +static SIMPLE_SEARCH_DOCUMENTS_VEC: Lazy = Lazy::new(|| { json!([ { "title": "Shazam!", @@ -56,7 +89,7 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { }]) }); -static SINGLE_DOCUMENT: Lazy = Lazy::new(|| { +static SINGLE_DOCUMENT_VEC: Lazy = Lazy::new(|| { json!([{ "title": "Shazam!", "desc": "a Captain Marvel ersatz", @@ -65,10 +98,29 @@ static SINGLE_DOCUMENT: Lazy = Lazy::new(|| { }]) }); +static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + }, + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + }]) +}); + #[actix_rt::test] async fn simple_search() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let (response, code) = index .search_post( @@ -85,8 +137,8 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); - snapshot!(response["semanticHitCount"], @"1"); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index .search_post( @@ -98,10 +150,59 @@ async fn simple_search() { snapshot!(response["semanticHitCount"], @"3"); } +#[actix_rt::test] +async fn simple_search_hf() { + let server = Server::new().await; + let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + let (response, code) = + index.search_post(json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}})).await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###); + snapshot!(response["semanticHitCount"], @"0"); + + let (response, code) = index + .search_post( + // disable ranking score as the vectors between architectures are not equal + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.55}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###); + snapshot!(response["semanticHitCount"], @"1"); + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.8}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###); + snapshot!(response["semanticHitCount"], @"3"); + + let (response, code) = index + .search_post( + json!({"q": "Movie World", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}]"###); + snapshot!(response["semanticHitCount"], @"3"); + + let (response, code) = index + .search_post( + json!({"q": "Wonder replacement", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###); + snapshot!(response["semanticHitCount"], @"3"); +} + #[actix_rt::test] async fn distribution_shift() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); let (response, code) = index.search_post(search.clone()).await; @@ -133,7 +234,7 @@ async fn distribution_shift() { #[actix_rt::test] async fn highlighter() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], @@ -184,7 +285,7 @@ async fn highlighter() { #[actix_rt::test] async fn invalid_semantic_ratio() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let (response, code) = index .search_post( @@ -256,7 +357,7 @@ async fn invalid_semantic_ratio() { #[actix_rt::test] async fn single_document() { let server = Server::new().await; - let index = index_with_documents(&server, &SINGLE_DOCUMENT).await; + let index = index_with_documents_user_provided(&server, &SINGLE_DOCUMENT_VEC).await; let (response, code) = index .search_post( @@ -272,7 +373,7 @@ async fn single_document() { #[actix_rt::test] async fn query_combination() { let server = Server::new().await; - let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; // search without query and vector, but with hybrid => still placeholder let (response, code) = index @@ -331,7 +432,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -374,6 +475,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain 
Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b4350f686..56fa226b2 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -680,6 +680,26 @@ async fn search_facet_distribution() { }, ) .await; + + index.update_settings(json!({"filterableAttributes": ["doggos.name"]})).await; + index.wait_task(5).await; + + index + .search( + json!({ + "facets": ["doggos.name"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + let dist = response["facetDistribution"].as_object().unwrap(); + assert_eq!(dist.len(), 1); + assert_eq!( + dist["doggos.name"], + json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) + ); + }, + ) + .await; } #[actix_rt::test] @@ -895,9 +915,9 @@ async fn test_score_details() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] }, "_rankingScoreDetails": { @@ -921,7 +941,7 @@ async fn test_score_details() { "order": 3, "attributeRankingOrderScore": 1.0, "queryWordDistanceScore": 0.8095238095238095, - "score": 0.9727891156462584 + "score": 0.8095238095238095 }, "exactness": { "order": 4, @@ -1096,9 +1116,9 @@ async fn experimental_feature_vector_store() { "id": "287947", "_vectors": { "manual": [ - 1, - 2, - 3 + 1.0, + 2.0, + 3.0 ] }, "_rankingScore": 1.0 @@ -1108,9 +1128,9 @@ async fn experimental_feature_vector_store() { "id": "299537", "_vectors": { "manual": [ - 1, - 2, - 54 + 1.0, + 2.0, + 54.0 ] }, "_rankingScore": 0.9129111766815186 @@ -1120,9 +1140,9 @@ async fn experimental_feature_vector_store() { "id": "450465", "_vectors": { "manual": [ - -100, - 340, - 90 + -100.0, + 340.0, + 90.0 ] }, "_rankingScore": 0.8106412887573242 @@ -1132,9 +1152,9 @@ async fn experimental_feature_vector_store() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] }, "_rankingScore": 0.7412010431289673 @@ -1144,9 +1164,9 @@ async fn experimental_feature_vector_store() { "id": "522681", "_vectors": { "manual": [ - 10, - -23, - 32 + 10.0, + -23.0, + 32.0 ] }, "_rankingScore": 0.6972063183784485 @@ -1405,9 +1425,9 @@ async fn simple_search_with_strange_synonyms() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] } } @@ -1426,9 +1446,9 @@ async fn simple_search_with_strange_synonyms() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] } } @@ -1447,9 +1467,9 @@ async fn simple_search_with_strange_synonyms() { "id": "166428", "_vectors": { "manual": [ - -100, - 231, - 32 + -100.0, + 231.0, + 32.0 ] } } diff --git a/meilisearch/tests/search/multi.rs b/meilisearch/tests/search/multi.rs index aeec1bad4..b5cf8f476 100644 --- a/meilisearch/tests/search/multi.rs +++ b/meilisearch/tests/search/multi.rs @@ -75,9 +75,9 @@ async fn simple_search_single_index() { "id": "450465", "_vectors": { "manual": [ - -100, - 340, - 90 + -100.0, + 340.0, + 90.0 ] } } @@ -96,9 +96,9 @@ async fn simple_search_single_index() { "id": "299537", "_vectors": { "manual": [ - 1, - 2, - 54 + 1.0, + 2.0, + 54.0 ] } } @@ -194,9 +194,9 @@ async fn simple_search_two_indexes() { "id": "450465", "_vectors": { "manual": [ - -100, - 340, - 90 + -100.0, + 340.0, + 90.0 ] } } @@ -227,9 +227,9 @@ async fn simple_search_two_indexes() { "cattos": "pésti", "_vectors": { "manual": [ - 1, - 2, 
- 3 + 1.0, + 2.0, + 3.0 ] } }, @@ -249,9 +249,9 @@ async fn simple_search_two_indexes() { ], "_vectors": { "manual": [ - 1, - 2, - 54 + 1.0, + 2.0, + 54.0 ] } } diff --git a/meilisearch/tests/search/restrict_searchable.rs b/meilisearch/tests/search/restrict_searchable.rs index 7bbdca38f..f52efa1f4 100644 --- a/meilisearch/tests/search/restrict_searchable.rs +++ b/meilisearch/tests/search/restrict_searchable.rs @@ -285,10 +285,10 @@ async fn attributes_ranking_rule_order() { @r###" [ { - "id": "2" + "id": "1" }, { - "id": "1" + "id": "2" } ] "### diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index cd31d4959..379e0a917 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -98,8 +98,7 @@ async fn secrets_are_hidden_in_settings() { { "vectorStore": true, "metrics": false, - "logsRoute": false, - "exportPuffinReports": false + "logsRoute": false } "###); diff --git a/meilisearch/tests/snapshot/mod.rs b/meilisearch/tests/snapshot/mod.rs index 1312aa9ca..67e80f45b 100644 --- a/meilisearch/tests/snapshot/mod.rs +++ b/meilisearch/tests/snapshot/mod.rs @@ -1,6 +1,5 @@ use std::time::Duration; -use actix_rt::time::sleep; use meili_snap::{json_string, snapshot}; use meilisearch::option::ScheduleSnapshot; use meilisearch::Opt; @@ -53,11 +52,29 @@ async fn perform_snapshot() { index.load_test_set().await; - server.index("test1").create(Some("prim")).await; + let (task, code) = server.index("test1").create(Some("prim")).await; + meili_snap::snapshot!(code, @"202 Accepted"); - index.wait_task(2).await; + index.wait_task(task.uid()).await; - sleep(Duration::from_secs(2)).await; + // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point + + println!("waited for the next task to finish"); + let now = std::time::Instant::now(); + let next_task = task.uid() + 1; + loop { + let (value, code) = index.get_task(next_task).await; + dbg!(&value); + if code != 404 && value["status"].as_str() == Some("succeeded") { + break; + } + + if now.elapsed() > Duration::from_secs(30) { + panic!("The snapshot didn't schedule in 30s even though it was supposed to be scheduled every 2s: {}", + serde_json::to_string_pretty(&value).unwrap() + ); + } + } let temp = tempfile::tempdir().unwrap(); diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index bace7d16b..06c4890a5 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -80,9 +80,7 @@ fn main() -> anyhow::Result<()> { /// Clears the task queue located at `db_path`. 
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { let path = db_path.join("tasks"); - let env = EnvOpenOptions::new() - .max_dbs(100) - .open(&path) + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) } .with_context(|| format!("While trying to open {:?}", path.display()))?; eprintln!("Deleting tasks from the database..."); @@ -129,7 +127,7 @@ fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { } } - eprintln!("Sucessfully deleted {count} content files from disk!"); + eprintln!("Successfully deleted {count} content files from disk!"); Ok(()) } @@ -193,9 +191,7 @@ fn export_a_dump( FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?; let index_scheduler_path = db_path.join("tasks"); - let env = EnvOpenOptions::new() - .max_dbs(100) - .open(&index_scheduler_path) + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; eprintln!("Dumping the keys..."); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 082cd0812..4a08e6261 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,7 +30,7 @@ grenad = { version = "0.4.6", default-features = false, features = [ "rayon", "tempfile", ] } -heed = { version = "0.20.0-alpha.9", default-features = false, features = [ +heed = { version = "0.20.1", default-features = false, features = [ "serde-json", "serde-bincode", "read-txn-no-tls", @@ -67,9 +67,6 @@ filter-parser = { path = "../filter-parser" } # documents words self-join itertools = "0.11.0" -# profiling -puffin = "0.16.0" - csv = "1.3.0" candle-core = { version = "0.4.1" } candle-transformers = { version = "0.4.1" } @@ -82,10 +79,10 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -arroy = "0.2.0" +arroy = "0.3.1" rand = "0.8.5" tracing = "0.1.40" -ureq = { version = "2.9.6", features = ["json"] } +ureq = { version = "2.9.7", features = ["json"] } url = "2.5.0" [dev-dependencies] diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 8640acf42..3d10ec599 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -48,7 +48,7 @@ fn main() -> Result<(), Box> { let start = Instant::now(); - let mut ctx = SearchContext::new(&index, &txn); + let mut ctx = SearchContext::new(&index, &txn)?; let universe = filtered_universe(&ctx, &None)?; let docs = execute_search( diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore new file mode 100644 index 000000000..a0925114d --- /dev/null +++ b/milli/fuzz/.gitignore @@ -0,0 +1,3 @@ +target +corpus +artifacts diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index e5124f67f..ec4d634aa 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -203,7 +203,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) { "string" => (field_name, AllowedType::String), "boolean" => (field_name, AllowedType::Boolean), "number" => (field_name, AllowedType::Number), - // if the pattern isn't reconized, we keep the whole field. + // if the pattern isn't recognized, we keep the whole field. 
_otherwise => (header, AllowedType::String), }, None => (header, AllowedType::String), diff --git a/milli/src/error.rs b/milli/src/error.rs index e4550de1f..83754afe4 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -32,6 +32,8 @@ pub enum InternalError { DatabaseClosing, #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, + #[error("Missing {key} in the fieldids weights mapping.")] + FieldidsWeightsMapMissingEntry { key: FieldId }, #[error(transparent)] FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), #[error("Missing {key} in the field id mapping.")] @@ -46,8 +48,6 @@ pub enum InternalError { GrenadInvalidFormatVersion, #[error("Invalid merge while processing {process}")] IndexingMergingKeys { process: &'static str }, - #[error("{}", HeedError::InvalidDatabaseTyping)] - InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), #[error(transparent)] @@ -117,10 +117,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidGeoField(#[from] GeoError), #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)] InvalidVectorDimensions { expected: usize, found: usize }, - #[error("The `_vectors.{subfield}` field in the document with id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")] - InvalidVectorsType { document_id: Value, value: Value, subfield: String }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] - InvalidVectorsMapType { document_id: Value, value: Value }, + InvalidVectorsMapType { document_id: String, value: Value }, #[error("{0}")] InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] @@ -427,7 +425,6 @@ impl From<HeedError> for Error { // TODO use the encoding HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), - HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), HeedError::DatabaseClosing => InternalError(DatabaseClosing), HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions), } diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs new file mode 100644 index 000000000..a737632a4 --- /dev/null +++ b/milli/src/fieldids_weights_map.rs @@ -0,0 +1,48 @@ +//! The fieldids weights map is in charge of linking the searchable fields to their weights. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{FieldId, FieldsIdsMap, Weight}; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct FieldidsWeightsMap { + map: HashMap<FieldId, Weight>, +} + +impl FieldidsWeightsMap { + /// Insert a field id -> weight into the map. + /// If the map did not have this key present, `None` is returned. + /// If the map did have this key present, the value is updated, and the old value is returned. + pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> { + self.map.insert(fid, weight) + } + + /// Create the map from the fields ids map. + /// Should only be called in the case there are NO searchable attributes.
+ /// All the fields will be inserted in the order of the fields ids map with a weight of 0. + pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { + FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + } + + /// Removes a field id from the map, returning the associated weight previously in the map. + pub fn remove(&mut self, fid: FieldId) -> Option<Weight> { + self.map.remove(&fid) + } + + /// Returns the weight corresponding to the key. + pub fn weight(&self, fid: FieldId) -> Option<Weight> { + self.map.get(&fid).copied() + } + + /// Returns the highest weight contained in the map, if any. + pub fn max_weight(&self) -> Option<Weight> { + self.map.values().copied().max() + } + + /// Returns an iterator visiting all field ids in arbitrary order. + pub fn ids(&self) -> impl Iterator<Item = FieldId> + '_ { + self.map.keys().copied() + } +} diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index dcab42c0a..1db518c7d 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -195,7 +195,7 @@ mod tests { fn merge_cbo_roaring_bitmaps() { let mut buffer = Vec::new(); - let small_data = vec![ + let small_data = [ RoaringBitmap::from_sorted_iter(1..4).unwrap(), RoaringBitmap::from_sorted_iter(2..5).unwrap(), RoaringBitmap::from_sorted_iter(4..6).unwrap(), @@ -209,7 +209,7 @@ mod tests { let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap(); assert_eq!(bitmap, expected); - let medium_data = vec![ + let medium_data = [ RoaringBitmap::from_sorted_iter(1..4).unwrap(), RoaringBitmap::from_sorted_iter(2..5).unwrap(), RoaringBitmap::from_sorted_iter(4..8).unwrap(), diff --git a/milli/src/index.rs b/milli/src/index.rs index 27b273393..982be0139 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -22,11 +23,12 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; -use crate::vector::EmbeddingConfig; +use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, - FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, - Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64, + FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, + FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -42,6 +44,7 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -181,7 +184,7 @@ impl Index { options.max_dbs(25); - let env = options.open(path)?; + let env = unsafe { options.open(path) }?; let mut wtxn = env.write_txn()?; let main = env.database_options().name(MAIN).create(&mut wtxn)?; let
word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; @@ -291,6 +294,11 @@ impl Index { self.env.read_txn() } + /// Create a static read transaction to be able to read the index without keeping a reference to it. + pub fn static_read_txn(&self) -> heed::Result<RoTxn<'static>> { + self.env.clone().static_read_txn() + } + /// Returns the canonicalized path where the heed `Env` of this `Index` lives. pub fn path(&self) -> &Path { self.env.path() @@ -414,6 +422,65 @@ impl Index { .unwrap_or_default()) } + /* fieldids weights map */ + // This maps the fields ids to their weights. + // Their weight is defined by the ordering of the searchable attributes. + + /// Writes the fieldids weights map which associates the field ids to their weights + pub(crate) fn put_fieldids_weights_map( + &self, + wtxn: &mut RwTxn, + map: &FieldidsWeightsMap, + ) -> heed::Result<()> { + self.main.remap_types::<Str, SerdeJson<FieldidsWeightsMap>>().put( + wtxn, + main_key::FIELDIDS_WEIGHTS_MAP_KEY, + map, + ) + } + + /// Get the fieldids weights map which associates the field ids to their weights + pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result<FieldidsWeightsMap> { + self.main + .remap_types::<Str, SerdeJson<FieldidsWeightsMap>>() + .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)? + .map(Ok) + .unwrap_or_else(|| { + Ok(FieldidsWeightsMap::from_field_id_map_without_searchable( + &self.fields_ids_map(rtxn)?, + )) + }) + } + + /// Delete the fieldids weights map + pub fn delete_fieldids_weights_map(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + self.main.remap_key_type::<Str>().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY) + } + + pub fn searchable_fields_and_weights<'a>( + &self, + rtxn: &'a RoTxn, + ) -> Result<Vec<(Cow<'a, str>, FieldId, Weight)>> { + let fid_map = self.fields_ids_map(rtxn)?; + let weight_map = self.fieldids_weights_map(rtxn)?; + let searchable = self.searchable_fields(rtxn)?; + + searchable + .into_iter() + .map(|field| -> Result<_> { + let fid = fid_map.id(&field).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: field.to_string(), + process: "searchable_fields_and_weights", + })?; + let weight = weight_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; + + Ok((field, fid, weight)) + }) + .collect() + } + /* geo rtree */ /// Writes the provided `rtree` which associates coordinates to documents ids. @@ -578,33 +645,42 @@ impl Index { wtxn: &mut RwTxn, user_fields: &[&str], fields_ids_map: &FieldsIdsMap, - ) -> heed::Result<()> { + ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; + let mut weights = FieldidsWeightsMap::default(); + // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. // 2. Iterate over the user defined searchable fields. // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. - let mut real_fields = user_fields.to_vec(); + // (ie doggo.name is a subset of doggo) then we push it right after doggo and with the same weight.
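+ // For instance (illustrative example, not taken from the original change): with searchable attributes ["name", "doggo"], + // `name` gets weight 0 while `doggo` and a nested `doggo.name` field both end up with weight 1.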
+ let mut real_fields = Vec::new(); - for field_from_map in fields_ids_map.names() { - for user_field in user_fields { + for (id, field_from_map) in fields_ids_map.iter() { + for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) - && !user_fields.contains(&field_from_map) + && !real_fields.contains(&field_from_map) { real_fields.push(field_from_map); + + let weight: u16 = + weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; + weights.insert(id, weight); } } } - self.put_searchable_fields(wtxn, &real_fields) + self.put_searchable_fields(wtxn, &real_fields)?; + self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { let did_delete_searchable = self.delete_searchable_fields(wtxn)?; let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?; + self.delete_fieldids_weights_map(wtxn)?; Ok(did_delete_searchable || did_delete_user_defined) } @@ -623,28 +699,31 @@ impl Index { } /// Returns the searchable fields, those are the fields that are indexed, - /// if the searchable fields aren't there it means that **all** the fields are indexed. - pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { self.main .remap_types::>>() - .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY) + .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)? + .map(|fields| Ok(fields.into_iter().map(Cow::Borrowed).collect())) + .unwrap_or_else(|| { + Ok(self + .fields_ids_map(rtxn)? + .names() + .map(|field| Cow::Owned(field.to_string())) + .collect()) + }) } /// Identical to `searchable_fields`, but returns the ids instead. - pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { - match self.searchable_fields(rtxn)? { - Some(fields) => { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let mut fields_ids = Vec::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(name) { - fields_ids.push(field_id); - } - } - Ok(Some(fields_ids)) + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.searchable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.push(field_id); } - None => Ok(None), } + Ok(fields_ids) } /// Writes the searchable fields, when this list is specified, only these are indexed. @@ -1527,6 +1606,44 @@ impl Index { pub(crate) fn delete_search_cutoff(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::SEARCH_CUTOFF) } + + pub fn embeddings( + &self, + rtxn: &RoTxn<'_>, + docid: DocumentId, + ) -> Result>> { + let mut res = BTreeMap::new(); + for row in self.embedder_category_id.iter(rtxn)? 
{ + let (embedder_name, embedder_id) = row?; + let embedder_id = (embedder_id as u16) << 8; + let mut embeddings = Vec::new(); + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader?.item_vector(rtxn, docid)?; + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + if !embeddings.is_empty() { + res.insert(embedder_name.to_owned(), embeddings); + } + } + Ok(res) + } } #[cfg(test)] @@ -1710,10 +1827,14 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, field_distribution, 1); + db_snap!(index, field_distribution, @r###" + age 1 | + id 2 | + name 2 | + "###); db_snap!(index, word_docids, - @r###" + @r###" 1 [0, ] 2 [1, ] 20 [1, ] @@ -1722,18 +1843,6 @@ pub(crate) mod tests { "### ); - db_snap!(index, field_distribution); - - db_snap!(index, field_distribution, - @r###" - age 1 | - id 2 | - name 2 | - "### - ); - - // snapshot_index!(&index, "1", include: "^field_distribution$"); - // we add all the documents a second time. we are supposed to get the same // field_distribution in the end index @@ -1820,7 +1929,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); @@ -1840,7 +1949,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); @@ -1856,7 +1965,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); @@ -2395,6 +2504,14 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); index .add_documents(documents!([ @@ -2410,6 +2527,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + "###); index.delete_documents(Default::default()); @@ -2420,6 +2547,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid 
weight + 0 0 | + 1 0 | + "###); index .add_documents(documents!([ @@ -2435,6 +2572,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + "###); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); @@ -2520,4 +2667,104 @@ pub(crate) mod tests { db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted } + + #[test] + fn swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + + index + .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" })) + .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 id | + 2 age | + 3 realName | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("realName")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + + // The order of the field id map shouldn't change + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 id | + 2 age | + 3 realName | + "###); + db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 3 1 | + "###); + } + + #[test] + fn attribute_weights_after_swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("beverage")]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "name": "kefir", "beverage": "water" }, + { "id": 1, "name": "tamo", "beverage": "kefir" } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find kefir the dog first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 0, + 1, + ] + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("beverage"), S("name")]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find tamo first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 1, + 0, + ] + "###); + } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a1e240464..4d4cdaf9b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,6 +28,7 @@ pub mod vector; #[cfg(test)] #[macro_use] pub mod snapshot_tests; +mod fieldids_weights_map; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; @@ -52,6 +53,7 @@ pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; pub use self::external_documents_ids::ExternalDocumentsIds; +pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, 
BoRoaringBitmapLenCodec, @@ -77,6 +79,7 @@ pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type FieldDistribution = BTreeMap; pub type FieldId = u16; +pub type Weight = u16; pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; @@ -351,43 +354,13 @@ pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator bool { - field.starts_with(facet) - && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) + field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.') } pub fn normalize_facet(original: &str) -> String { CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase() } -/// Represents either a vector or an array of multiple vectors. -#[derive(serde::Serialize, serde::Deserialize, Debug)] -#[serde(transparent)] -pub struct VectorOrArrayOfVectors { - #[serde(with = "either::serde_untagged_optional")] - inner: Option, Vec>>>, -} - -impl VectorOrArrayOfVectors { - pub fn into_array_of_vectors(self) -> Option>> { - match self.inner? { - either::Either::Left(vector) => Some(vec![vector]), - either::Either::Right(vectors) => Some(vectors), - } - } -} - -/// Normalize a vector by dividing the dimensions by the length of it. -pub fn normalize_vector(mut vector: Vec) -> Vec { - let squared: f32 = vector.iter().map(|x| x * x).sum(); - let length = squared.sqrt(); - if length <= f32::EPSILON { - vector - } else { - vector.iter_mut().for_each(|x| *x /= length); - vector - } -} - #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bab67e6bd..ca0eda49e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -147,7 +147,7 @@ impl<'a> Search<'a> { pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { - let ctx = SearchContext::new(self.index, self.rtxn); + let ctx = SearchContext::new(self.index, self.rtxn)?; filtered_universe(&ctx, &self.filter) } else { Ok(self.execute()?.candidates) @@ -155,10 +155,10 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let mut ctx = SearchContext::new(self.index, self.rtxn); + let mut ctx = SearchContext::new(self.index, self.rtxn)?; if let Some(searchable_attributes) = self.searchable_attributes { - ctx.searchable_attributes(searchable_attributes)?; + ctx.attributes_to_search_on(searchable_attributes)?; } let universe = filtered_universe(&ctx, &self.filter)?; diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 521fcb983..e9bc5449d 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -101,7 +101,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let mut ranking_rule_universes: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); + ranking_rule_universes[0].clone_from(universe); let mut cur_ranking_rule_index = 0; /// Finish iterating over the current ranking rule, yielding @@ -232,7 +232,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates); logger.start_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 62c921a1d..4fa0765e0 100644 --- 
a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -163,7 +163,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -192,7 +192,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -242,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(prefix).as_str(); let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -271,7 +271,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(prefix).as_str(); let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> { .map_err(heed::Error::Decoding)? } else { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut docids = RoaringBitmap::new(); for fid in fids { // for each field, intersect left word bitmap and right word bitmap, @@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> { let prefix_docids = match proximity_precision { ProximityPrecision::ByAttribute => { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut prefix_docids = RoaringBitmap::new(); // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 7932f0c2a..41b70ae39 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -184,13 +184,7 @@ impl State { return Ok(State::Empty(query_graph.clone())); } - let searchable_fields_ids = { - if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? 
{ - fids - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().collect() - } - }; + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?; let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index 5f5ceb379..4081c9637 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -42,7 +42,7 @@ fn facet_number_values<'a>( } /// Define the strategy used by the geo sort. -/// The paramater represents the cache size, and, in the case of the Dynamic strategy, +/// The parameter represents the cache size, and, in the case of the Dynamic strategy, /// the point where we move from using the iterative strategy to the rtree. #[derive(Debug, Clone, Copy)] pub enum Strategy { diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 56bf6c169..4db1c99c6 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -258,7 +258,7 @@ pub(crate) mod tests { fn matching_words() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); + let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 2913f206d..40e6f8dc8 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -134,7 +134,7 @@ impl<'t> Matcher<'t, '_> { for (token_position, word_position, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, - // we temporarly save the current token then we try to match the next one. + // we temporarily save the current token then we try to match the next one. Some(MatchType::Partial(partial)) => { potential_matches.push((token_position, word_position, partial.char_len())); partial @@ -506,7 +506,7 @@ mod tests { impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { - let mut ctx = SearchContext::new(index, rtxn); + let mut ctx = SearchContext::new(index, rtxn).unwrap(); let universe = filtered_universe(&ctx, &None).unwrap(); let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( &mut ctx, @@ -722,7 +722,7 @@ mod tests { @"…void void void void void split the world void void" ); - // Text containing matches with diferent density. + // Text containing matches with different density. let text = "split void the void void world void void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. 
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 617068ef8..5e4c2f829 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; -use crate::error::FieldIdMapMissingEntry; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::Embedder; use crate::{ AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget, - UserError, + UserError, Weight, }; /// A structure used throughout the execution of a search query. @@ -71,8 +70,21 @@ pub struct SearchContext<'ctx> { } impl<'ctx> SearchContext<'ctx> { - pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { - Self { + pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> { + let searchable_fids = index.searchable_fields_and_weights(txn)?; + let exact_attributes_ids = index.exact_attributes_ids(txn)?; + + let mut exact = Vec::new(); + let mut tolerant = Vec::new(); + for (_name, fid, weight) in searchable_fids { + if exact_attributes_ids.contains(&fid) { + exact.push((fid, weight)); + } else { + tolerant.push((fid, weight)); + } + } + + Ok(Self { index, txn, db_cache: <_>::default(), @@ -81,42 +93,39 @@ impl<'ctx> SearchContext<'ctx> { term_interner: <_>::default(), phrase_docids: <_>::default(), restricted_fids: None, - } + }) } - pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> { - let fids_map = self.index.fields_ids_map(self.txn)?; - let searchable_names = self.index.searchable_fields(self.txn)?; + pub fn attributes_to_search_on( + &mut self, + attributes_to_search_on: &'ctx [String], + ) -> Result<()> { + let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?; + let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; + let mut wildcard = false; + let mut restricted_fids = RestrictedFids::default(); - let mut contains_wildcard = false; - for field_name in searchable_attributes { + for field_name in attributes_to_search_on { if field_name == "*" { - contains_wildcard = true; + wildcard = true; + // we cannot early exit as we want to return an error in case of unknown fields continue; } - let searchable_contains_name = - searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name)); - let fid = match (fids_map.id(field_name), searchable_contains_name) { + let searchable_weight = + searchable_fields_weights.iter().find(|(name, _, _)| name == field_name); + let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable - (Some(fid), Some(true)) | (Some(fid), None) => fid, - // The field is searchable but the Field id doesn't exist => Internal Error - (None, Some(true)) => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: field_name.to_string(), - process: "search", - } - .into()) - } - // The field is not searchable, but the searchableAttributes are set to * => ignore field - (None, None) => continue, + Some((_name, fid, weight)) => (*fid, *weight), + // The field is not searchable but the user didn't define any searchable attributes + None if user_defined_searchable.is_none() => continue, // The field is not searchable => User error - (_fid, Some(false)) => { - let
(valid_fields, hidden_fields) = match searchable_names { - Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?, - None => self.index.remove_hidden_fields(self.txn, fids_map.names())?, - }; + None => { + let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( + self.txn, + searchable_fields_weights.iter().map(|(name, _, _)| name), + )?; let field = field_name.to_string(); return Err(UserError::InvalidSearchableAttribute { @@ -129,13 +138,17 @@ impl<'ctx> SearchContext<'ctx> { }; if exact_attributes_ids.contains(&fid) { - restricted_fids.exact.push(fid); + restricted_fids.exact.push((fid, weight)); } else { - restricted_fids.tolerant.push(fid); + restricted_fids.tolerant.push((fid, weight)); }; } - self.restricted_fids = (!contains_wildcard).then_some(restricted_fids); + if wildcard { + self.restricted_fids = None; + } else { + self.restricted_fids = Some(restricted_fids); + } Ok(()) } @@ -158,13 +171,13 @@ impl Word { #[derive(Debug, Clone, Default)] pub struct RestrictedFids { - pub tolerant: Vec, - pub exact: Vec, + pub tolerant: Vec<(FieldId, Weight)>, + pub exact: Vec<(FieldId, Weight)>, } impl RestrictedFids { pub fn contains(&self, fid: &FieldId) -> bool { - self.tolerant.contains(fid) || self.exact.contains(fid) + self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid) } } diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 86be7da77..74b2ed564 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -119,7 +119,7 @@ pub fn located_query_terms_from_tokens( if let Some(located_query_term) = phrase.build(ctx) { // as we are evaluating a negative operator we put the phrase // in the negative one *but* we don't reset the negative operator - // as we are immediatly starting a new negative phrase. + // as we are immediately starting a new negative phrase. if negative_phrase { negative_phrases.push(located_query_term); } else { @@ -366,7 +366,7 @@ mod tests { let tokens = tokenizer.tokenize("."); let index = temp_index_with_documents(); let rtxn = index.read_txn()?; - let mut ctx = SearchContext::new(&index, &rtxn); + let mut ctx = SearchContext::new(&index, &rtxn)?; // panics with `attempt to add with overflow` before let ExtractedTokens { query_terms, .. } = located_query_terms_from_tokens(&mut ctx, tokens, None)?; diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 8f3e0cc82..a4a08ea46 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; -use crate::Result; +use crate::{FieldId, InternalError, Result}; #[derive(Clone, PartialEq, Eq, Hash)] pub struct FidCondition { term: LocatedQueryTermSubset, - fid: u16, + fid: Option, } pub enum FidGraph {} @@ -26,13 +26,15 @@ impl RankingRuleGraphTrait for FidGraph { universe: &RoaringBitmap, ) -> Result { let FidCondition { term, .. 
} = condition; - // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let mut docids = compute_query_term_subset_docids_within_field_id( - ctx, - &term.term_subset, - condition.fid, - )?; - docids &= universe; + + let docids = if let Some(fid) = condition.fid { + // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument + let docids = + compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; + docids & universe + } else { + RoaringBitmap::new() + }; Ok(ComputedCondition { docids, @@ -68,34 +70,29 @@ impl RankingRuleGraphTrait for FidGraph { all_fields.extend(fields); } + let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?; + let mut edges = vec![]; for fid in all_fields.iter().copied() { + let weight = weights_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; edges.push(( - fid as u32 * term.term_ids.len() as u32, - conditions_interner.insert(FidCondition { term: term.clone(), fid }), + weight as u32 * term.term_ids.len() as u32, + conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), )); } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_fid: Option = { - if let Some(max_fid) = ctx - .index - .searchable_fields_ids(ctx.txn)? - .map(|field_ids| field_ids.into_iter().max()) - { - max_fid - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().max() - } - }; + let max_weight: Option = weights_map.max_weight(); - if let Some(max_fid) = max_fid { - if !all_fields.contains(&max_fid) { + if let Some(max_weight) = max_weight { + if !all_fields.contains(&max_weight) { edges.push(( - max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. + max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. 
conditions_interner.insert(FidCondition { term: term.clone(), // TODO remove this ugly clone - fid: max_fid, + fid: None, }), )); } diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 38225404c..c595887ba 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -1,5 +1,5 @@ use crate::index::tests::TempIndex; -use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -131,6 +131,19 @@ fn test_attribute_fid_simple() { #[test] fn test_attribute_fid_ngrams() { let index = create_index(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 title | + 2 description | + 3 plot | + "###); + db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + 2 1 | + 3 2 | + "###); let txn = index.read_txn().unwrap(); diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap new file mode 100644 index 000000000..930a21626 --- /dev/null +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap @@ -0,0 +1,244 @@ +--- +source: milli/src/search/new/tests/attribute_fid.rs +expression: "format!(\"{document_ids_scores:#?}\")" +--- +[ + ( + 2, + [ + Fid( + Rank { + rank: 19, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), + ( + 6, + [ + Fid( + Rank { + rank: 15, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 5, + [ + Fid( + Rank { + rank: 14, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 4, + [ + Fid( + Rank { + rank: 13, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 3, + [ + Fid( + Rank { + rank: 12, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 83, + max_rank: 91, + }, + ), + ], + ), + ( + 9, + [ + Fid( + Rank { + rank: 11, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 8, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 7, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 73, + max_rank: 91, + }, + ), + ], + ), + ( + 11, + [ + Fid( + Rank { + rank: 7, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 10, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 13, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 12, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 78, + max_rank: 91, + }, + ), + ], + ), + ( + 14, + [ + Fid( + Rank { + rank: 5, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 0, + [ + Fid( + Rank { + rank: 1, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), +] diff --git 
a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 28c4cb45c..d79003747 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String { } snap } +pub fn snap_fieldids_weights_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let weights_map = index.fieldids_weights_map(&rtxn).unwrap(); + + let mut snap = String::new(); + writeln!(&mut snap, "fid weight").unwrap(); + let mut field_ids: Vec<_> = weights_map.ids().collect(); + field_ids.sort(); + for field_id in field_ids { + let weight = weights_map.weight(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {weight:<3} |").unwrap(); + } + snap +} +pub fn snap_searchable_fields(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + format!("{searchable_fields:?}") +} pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); @@ -469,6 +488,12 @@ macro_rules! full_snap_of_db { ($index:ident, fields_ids_map) => {{ $crate::snapshot_tests::snap_fields_ids_map(&$index) }}; + ($index:ident, fieldids_weights_map) => {{ + $crate::snapshot_tests::snap_fieldids_weights_map(&$index) + }}; + ($index:ident, searchable_fields) => {{ + $crate::snapshot_tests::snap_searchable_fields(&$index) + }}; ($index:ident, geo_faceted_documents_ids) => {{ $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6715939dc..3490b55e4 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -21,8 +21,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { name = "clear_documents" )] pub fn execute(self) -> Result { - puffin::profile_function!(); - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; let Index { env: _env, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 798e0fe3d..f871eee31 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -499,7 +499,7 @@ impl FacetsUpdateIncrementalInner { ModificationResult::Expand | ModificationResult::Reduce { .. } ) { - // if any modification occured, insert it in the database. + // if any modification occurred, insert it in the database. 
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; Ok(insertion_key_modification) } else { diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 0af64c4c5..42994551f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -379,7 +379,7 @@ pub(crate) mod test_helpers { let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); - let env = options.open(tempdir.path()).unwrap(); + let env = unsafe { options.open(tempdir.path()) }.unwrap(); let mut wtxn = env.write_txn().unwrap(); let content = env.create_database(&mut wtxn, None).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 162136912..2da717bb0 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -29,8 +29,6 @@ pub fn enrich_documents_batch( autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { - puffin::profile_function!(); - let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6af5bba6d..9c557de81 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -29,8 +29,6 @@ pub fn extract_docid_word_positions( settings_diff: &InnerIndexSettingsDiff, max_positions_per_attributes: Option, ) -> Result<(grenad::Reader>, ScriptLanguageDocidsMap)> { - puffin::profile_function!(); - let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); @@ -186,7 +184,7 @@ fn searchable_fields_changed( ) -> bool { let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.contains(&field_id) { let del_add = KvReaderDelAdd::new(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -298,7 +296,7 @@ fn lang_safe_tokens_from_document<'a>( /// Extract words mapped with their positions of a document. fn tokens_from_document<'a>( obkv: &KvReader, - searchable_fields: &Option>, + searchable_fields: &[FieldId], tokenizer: &Tokenizer, max_positions_per_attributes: u32, del_add: DelAdd, @@ -309,7 +307,7 @@ fn tokens_from_document<'a>( let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { // if field is searchable. - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.as_ref().contains(&field_id) { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { // parse json. 
diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 1848a085f..bfd769604 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -23,8 +23,6 @@ pub fn extract_facet_number_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut facet_number_docids_sorter = create_sorter( diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index abffe17ab..3deace127 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -28,8 +28,6 @@ pub fn extract_facet_string_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result<(grenad::Reader>, grenad::Reader>)> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let options = NormalizerOption { lossy: true, ..Default::default() }; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index caf53550c..3cbd7e49e 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -37,7 +37,7 @@ pub struct ExtractedFacetValues { /// Extracts the facet values of each faceted field of each document. /// -/// Returns the generated grenad reader containing the docid the fid and the orginal value as key +/// Returns the generated grenad reader containing the docid the fid and the original value as key /// and the normalized value as value extracted from the given chunk of documents. /// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially. 
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] @@ -46,8 +46,6 @@ pub fn extract_fid_docid_facet_values( indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, ) -> Result { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 51e0642da..f252df1cd 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -26,8 +26,6 @@ pub fn extract_fid_word_count_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut fid_word_count_docids_sorter = create_sorter( diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index b1746898e..f997d6ab7 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -21,8 +21,6 @@ pub fn extract_geo_points( primary_key_id: FieldId, settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 322fa3725..76ec90d65 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -10,16 +10,16 @@ use bytemuck::cast_slice; use grenad::Writer; use itertools::EitherOrBoth; use ordered_float::OrderedFloat; -use serde_json::{from_slice, Value}; +use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; -use crate::error::UserError; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::Embedder; -use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors}; +use crate::{DocumentId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. const TRUNCATE_SIZE: usize = size_of::(); @@ -31,6 +31,10 @@ pub struct ExtractedVectorPoints { pub remove_vectors: grenad::Reader>, // docid -> prompt pub prompts: grenad::Reader>, + + // embedder + pub embedder_name: String, + pub embedder: Arc, } enum VectorStateDelta { @@ -65,6 +69,19 @@ impl VectorStateDelta { } } +struct EmbedderVectorExtractor { + embedder_name: String, + embedder: Arc, + prompt: Arc, + + // (docid, _index) -> KvWriterDelAdd -> Vector + manual_vectors_writer: Writer>, + // (docid) -> (prompt) + prompts_writer: Writer>, + // (docid) -> () + remove_vectors_writer: Writer>, +} + /// Extracts the embedding vector contained in each document under the `_vectors` field. 
/// /// Returns the generated grenad reader containing the docid as key associated to the Vec @@ -73,34 +90,52 @@ pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, - prompt: &Prompt, - embedder_name: &str, -) -> Result { - puffin::profile_function!(); +) -> Result> { + let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; let new_fields_ids_map = &settings_diff.new.fields_ids_map; + // the vector field id may have changed + let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); + // filter the old vector fid if the settings has been changed forcing reindexing. + let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors); - // (docid, _index) -> KvWriterDelAdd -> Vector - let mut manual_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); - // (docid) -> (prompt) - let mut prompts_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut extractors = Vec::new(); + for (embedder_name, (embedder, prompt)) in + settings_diff.new.embedding_configs.clone().into_iter() + { + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); - // (docid) -> () - let mut remove_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + manual_vectors_writer, + prompts_writer, + remove_vectors_writer, + }); + } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -114,152 +149,138 @@ pub fn extract_vector_points( key_buffer.clear(); key_buffer.extend_from_slice(docid_bytes); - // since we only needs the primary key when we throw an error we create this getter to + // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - // the vector field id may have changed - let old_vectors_fid = old_fields_ids_map.id("_vectors"); - // filter the old vector fid if the settings has been changed forcing reindexing. - let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors()); + let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; - let new_vectors_fid = new_fields_ids_map.id("_vectors"); - let vectors_field = { - let del = old_vectors_fid - .and_then(|vectors_fid| obkv.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id)) - .transpose()? 
- .flatten(); - let add = new_vectors_fid - .and_then(|vectors_fid| obkv.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id)) - .transpose()? - .flatten(); - (del, add) - }; + for EmbedderVectorExtractor { + embedder_name, + embedder: _, + prompt, + manual_vectors_writer, + prompts_writer, + remove_vectors_writer, + } in extractors.iter_mut() + { + let delta = match parsed_vectors.remove(embedder_name) { + (Some(old), Some(new)) => { + // no autogeneration + let del_vectors = old.into_array_of_vectors(); + let add_vectors = new.into_array_of_vectors(); - let (del_map, add_map) = vectors_field; - - let del_value = del_map.and_then(|mut map| map.remove(embedder_name)); - let add_value = add_map.and_then(|mut map| map.remove(embedder_name)); - - let delta = match (del_value, add_value) { - (Some(old), Some(new)) => { - // no autogeneration - let del_vectors = extract_vectors(old, document_id, embedder_name)?; - let add_vectors = extract_vectors(new, document_id, embedder_name)?; - - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::ManualDelta(del_vectors, add_vectors) - } - (Some(_old), None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) - } else { - VectorStateDelta::NowRemoved - } - } - (None, Some(new)) => { - // was possibly autogenerated, remove all vectors for that document - let add_vectors = extract_vectors(new, document_id, embedder_name)?; - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::WasGeneratedNowManual(add_vectors) - } - (None, None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - - if document_is_kept { - // Don't give up if the old prompt was failing - let old_prompt = Some(prompt) - // TODO: this filter works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. 
- .filter(|_| !settings_diff.reindex_vectors()) - .map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() - }); - let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); } - } else { - VectorStateDelta::NowRemoved - } - } - }; - // and we finally push the unique vectors into the writer - push_vectors_diff( - &mut remove_vectors_writer, - &mut prompts_writer, - &mut manual_vectors_writer, - &mut key_buffer, - delta, - settings_diff, - )?; + VectorStateDelta::ManualDelta(del_vectors, add_vectors) + } + (Some(_old), None) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // becomes autogenerated + VectorStateDelta::NowGenerated(prompt.render( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) + } else { + VectorStateDelta::NowRemoved + } + } + (None, Some(new)) => { + // was possibly autogenerated, remove all vectors for that document + let add_vectors = new.into_array_of_vectors(); + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } + + VectorStateDelta::WasGeneratedNowManual(add_vectors) + } + (None, None) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + + if document_is_kept { + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt) + // TODO: this filter works because we erase the vec database when a embedding setting changes. + // When vector pipeline will be optimized, this should be removed. 
+ .filter(|_| !settings_diff.reindex_vectors()) + .map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or_default() + }); + let new_prompt = + prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } + } else { + VectorStateDelta::NowRemoved + } + } + }; + + // and we finally push the unique vectors into the writer + push_vectors_diff( + remove_vectors_writer, + prompts_writer, + manual_vectors_writer, + &mut key_buffer, + delta, + reindex_vectors, + )?; + } } - Ok(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector - manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () - remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt - prompts: writer_into_reader(prompts_writer)?, - }) -} + let mut results = Vec::new(); -fn to_vector_map( - obkv: KvReaderDelAdd, - side: DelAdd, - document_id: &impl Fn() -> Value, -) -> Result>> { - Ok(if let Some(value) = obkv.get(side) { - let Ok(value) = from_slice(value) else { - let value = from_slice(value).map_err(InternalError::SerdeJson)?; - return Err(crate::Error::UserError(UserError::InvalidVectorsMapType { - document_id: document_id(), - value, - })); - }; - Some(value) - } else { - None - }) + for EmbedderVectorExtractor { + embedder_name, + embedder, + prompt: _, + manual_vectors_writer, + prompts_writer, + remove_vectors_writer, + } in extractors + { + results.push(ExtractedVectorPoints { + // docid, _index -> KvWriterDelAdd -> Vector + manual_vectors: writer_into_reader(manual_vectors_writer)?, + // docid -> () + remove_vectors: writer_into_reader(remove_vectors_writer)?, + // docid -> prompt + prompts: writer_into_reader(prompts_writer)?, + + embedder, + embedder_name, + }) + } + + Ok(results) } /// Computes the diff between both Del and Add numbers and @@ -270,14 +291,13 @@ fn push_vectors_diff( manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, - settings_diff: &InnerIndexSettingsDiff, + reindex_vectors: bool, ) -> Result<()> { - puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); if must_remove // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. - && !settings_diff.reindex_vectors() + && !reindex_vectors { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; @@ -308,7 +328,7 @@ fn push_vectors_diff( EitherOrBoth::Left(vector) => { // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. - if !settings_diff.reindex_vectors() { + if !reindex_vectors { // We insert only the Del part of the Obkv to inform // that we only want to remove all those vectors. let mut obkv = KvWriterDelAdd::memory(); @@ -336,26 +356,6 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) } -/// Extracts the vectors from a JSON value. 
-fn extract_vectors( - value: Value, - document_id: impl Fn() -> Value, - name: &str, -) -> Result>> { - // FIXME: ugly clone of the vectors here - match serde_json::from_value(value.clone()) { - Ok(vectors) => { - Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors).unwrap_or_default()) - } - Err(_) => Err(UserError::InvalidVectorsType { - document_id: document_id(), - value, - subfield: name.to_owned(), - } - .into()), - } -} - #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] pub fn extract_embeddings( // docid, prompt @@ -364,7 +364,6 @@ pub fn extract_embeddings( embedder: Arc, request_threads: &ThreadPoolNoAbort, ) -> Result>> { - puffin::profile_function!(); let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5699f2fb6..457d2359e 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -36,8 +36,6 @@ pub fn extract_word_docids( grenad::Reader>, grenad::Reader>, )> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut word_fid_docids_sorter = create_sorter( @@ -167,8 +165,6 @@ fn words_into_sorter( add_words: &BTreeSet>, word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { - puffin::profile_function!(); - use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 23f70ccd2..617338f9f 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -26,7 +26,6 @@ pub fn extract_word_pair_proximity_docids( indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; @@ -71,8 +70,6 @@ pub fn extract_word_pair_proximity_docids( // if we change document, we fill the sorter if current_document_id.map_or(false, |id| id != document_id) { - puffin::profile_scope!("Document into sorter"); - // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter"); let _entered = span.enter(); @@ -163,7 +160,6 @@ pub fn extract_word_pair_proximity_docids( } if let Some(document_id) = current_document_id { - puffin::profile_scope!("Final document into sorter"); // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter"); let _entered = span.enter(); @@ -176,7 +172,6 @@ pub fn extract_word_pair_proximity_docids( )?; } { - puffin::profile_scope!("sorter_into_reader"); // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader"); let _entered = span.enter(); diff --git 
a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 45a05b0d0..50b1617f9 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -25,8 +25,6 @@ pub fn extract_word_position_docids( indexer: GrenadParameters, _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - puffin::profile_function!(); - let max_memory = indexer.max_memory_by_thread(); let mut word_position_docids_sorter = create_sorter( @@ -104,8 +102,6 @@ fn words_position_into_sorter( add_word_positions: &BTreeSet<(u16, Vec)>, word_position_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { - puffin::profile_function!(); - use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 237e19b2a..18340a3ae 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -46,8 +46,6 @@ pub(crate) fn data_from_obkv_documents( settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<()> { - puffin::profile_function!(); - let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join( || { original_obkv_chunks @@ -88,7 +86,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_fid_word_count_docids, TypedChunk::FieldIdWordCountDocids, - "field-id-wordcount-docids", ); run_extraction_task::< _, @@ -115,7 +112,6 @@ pub(crate) fn data_from_obkv_documents( word_fid_docids_reader, } }, - "word-docids", ); run_extraction_task::<_, _, grenad::Reader>>( @@ -125,7 +121,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_word_position_docids, TypedChunk::WordPositionDocids, - "word-position-docids", ); run_extraction_task::< @@ -139,7 +134,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_facet_string_docids, TypedChunk::FieldIdFacetStringDocids, - "field-id-facet-string-docids", ); run_extraction_task::<_, _, grenad::Reader>>( @@ -149,7 +143,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_facet_number_docids, TypedChunk::FieldIdFacetNumberDocids, - "field-id-facet-number-docids", ); run_extraction_task::<_, _, grenad::Reader>>( @@ -159,7 +152,6 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_word_pair_proximity_docids, TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", ); } @@ -183,7 +175,6 @@ fn run_extraction_task( lmdb_writer_sx: Sender>, extract_fn: FE, serialize_fn: FS, - name: &'static str, ) where FE: Fn( grenad::Reader, @@ -201,7 +192,7 @@ fn run_extraction_task( rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); - puffin::profile_scope!("extract_multiple_chunks", name); + match extract_fn(chunk, indexer, &settings_diff) { Ok(chunk) => { let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); @@ -224,27 +215,31 @@ fn send_original_documents_data( let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; - let documents_chunk_cloned = original_documents_chunk.clone(); - let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - let request_threads = 
ThreadPoolNoAbortBuilder::new() .num_threads(crate::vector::REQUEST_PARALLELISM) .thread_name(|index| format!("embedding-request-{index}")) .build()?; - if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() { + let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) + // no point in indexing vectors without embedders + && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); + + if index_vectors { let settings_diff = settings_diff.clone(); + + let original_documents_chunk = original_documents_chunk.clone(); + let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() { - let result = extract_vector_points( - documents_chunk_cloned.clone(), - indexer, - &settings_diff, - &prompt, - &name, - ); - match result { - Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => { + match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { + Ok(extracted_vectors) => { + for ExtractedVectorPoints { + manual_vectors, + remove_vectors, + prompts, + embedder_name, + embedder, + } in extracted_vectors + { let embeddings = match extract_embeddings( prompts, indexer, @@ -253,28 +248,26 @@ fn send_original_documents_data( ) { Ok(results) => Some(results), Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); + let _ = lmdb_writer_sx.send(Err(error)); None } }; - if !(remove_vectors.is_empty() && manual_vectors.is_empty() && embeddings.as_ref().map_or(true, |e| e.is_empty())) { - let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints { remove_vectors, embeddings, expected_dimension: embedder.dimensions(), manual_vectors, - embedder_name: name, + embedder_name, })); } } - - Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); - } + } + Err(error) => { + let _ = lmdb_writer_sx.send(Err(error)); } } }); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index b0e3654a9..aa574024d 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -61,7 +61,6 @@ pub fn sorter_into_reader( sorter: grenad::Sorter, indexer: GrenadParameters, ) -> Result>> { - puffin::profile_function!(); let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -182,8 +181,6 @@ where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { - puffin::profile_function!(); - let mut buffer = Vec::new(); let database = database.remap_types::(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 01684a54a..046498a8b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -6,6 +6,7 @@ mod typed_chunk; use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; +use std::iter; use std::num::NonZeroU32; use std::result::Result as StdResult; use std::sync::Arc; @@ -140,8 +141,6 @@ where mut self, reader: DocumentsBatchReader, ) -> Result<(Self, StdResult)> { - puffin::profile_function!(); - // Early return when there is no document to add if reader.is_empty() { return Ok((self, Ok(0))); @@ -186,8 +185,6 @@ where mut self, to_delete: Vec, ) -> Result<(Self, StdResult)> { - 
puffin::profile_function!(); - // Early return when there is no document to add if to_delete.is_empty() { // Maintains Invariant: remove documents actually always returns Ok for the inner result @@ -222,8 +219,6 @@ where mut self, to_delete: &RoaringBitmap, ) -> Result<(Self, u64)> { - puffin::profile_function!(); - // Early return when there is no document to add if to_delete.is_empty() { return Ok((self, 0)); @@ -248,8 +243,6 @@ where name = "index_documents" )] pub fn execute(mut self) -> Result { - puffin::profile_function!(); - if self.added_documents == 0 && self.deleted_documents == 0 { let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); @@ -278,8 +271,6 @@ where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - let TransformOutput { primary_key, mut settings_diff, @@ -337,7 +328,10 @@ where let min_chunk_size = 1024 * 512; // 512KiB // compute the chunk size from the number of available threads and the inputed data size. - let total_size = flattened_documents.metadata().map(|m| m.len()); + let total_size = match flattened_documents.as_ref() { + Some(flattened_documents) => flattened_documents.metadata().map(|m| m.len()), + None => Ok(default_chunk_size as u64), + }; let current_num_threads = pool.current_num_threads(); // if we have more than 2 thread, create a number of chunk equal to 3/4 threads count let chunk_count = if current_num_threads > 2 { @@ -351,8 +345,14 @@ where } }; - let original_documents = grenad::Reader::new(original_documents)?; - let flattened_documents = grenad::Reader::new(flattened_documents)?; + let original_documents = match original_documents { + Some(original_documents) => Some(grenad::Reader::new(original_documents)?), + None => None, + }; + let flattened_documents = match flattened_documents { + Some(flattened_documents) => Some(grenad::Reader::new(flattened_documents)?), + None => None, + }; let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; @@ -371,15 +371,23 @@ where pool.install(|| { rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); - let _enter = child_span.enter(); - puffin::profile_scope!("extract_and_send_grenad_chunks"); - // split obkv file into several chunks - let original_chunk_iter = - grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); + let _enter = child_span.enter(); // split obkv file into several chunks - let flattened_chunk_iter = - grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); + let original_chunk_iter = match original_documents { + Some(original_documents) => { + grenad_obkv_into_chunks(original_documents,pool_params,documents_chunk_size).map(either::Left) + }, + None => Ok(either::Right(iter::empty())), + }; + + // split obkv file into several chunks + let flattened_chunk_iter = match flattened_documents { + Some(flattened_documents) => { + grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size).map(either::Left) + }, + None => Ok(either::Right(iter::empty())), + }; let result = original_chunk_iter.and_then(|original_chunk| { let flattened_chunk = flattened_chunk_iter?; @@ -533,7 +541,7 @@ where let writer_index = (embedder_index as u16) << 8; for k in 0..=u8::MAX { let writer = - arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?; + 
arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension); if writer.is_empty(wtxn)? { break; } @@ -571,8 +579,6 @@ where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - // Merged databases are already been indexed, we start from this count; let mut databases_seen = MERGED_DATABASE_COUNT; @@ -616,7 +622,6 @@ where { let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs"); let _entered = span.enter(); - puffin::profile_scope!("compute_prefix_diffs"); current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; @@ -756,8 +761,6 @@ fn execute_word_prefix_docids( common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - puffin::profile_function!(); - let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); builder.chunk_compression_type = indexer_config.chunk_compression_type; builder.chunk_compression_level = indexer_config.chunk_compression_level; @@ -3237,6 +3240,7 @@ mod tests { } #[test] + #[cfg(feature = "all-tokenizations")] fn stored_detected_script_and_language_should_not_return_deleted_documents() { use charabia::{Language, Script}; let index = TempIndex::new(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8a3463e6f..41a0a55cf 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -20,21 +20,21 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{ - del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd, -}; +use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; +use crate::{ + is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, +}; pub struct TransformOutput { pub primary_key: String, pub settings_diff: InnerIndexSettingsDiff, pub field_distribution: FieldDistribution, pub documents_count: usize, - pub original_documents: File, - pub flattened_documents: File, + pub original_documents: Option, + pub flattened_documents: Option, } /// Extract the external ids, deduplicate and compute the new internal documents ids @@ -161,8 +161,6 @@ impl<'a, 'i> Transform<'a, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); let external_documents_ids = self.index.external_documents_ids(); let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; @@ -375,8 +373,6 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - // there may be duplicates in the documents to remove. 
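As an aside on the now-optional original/flattened document files above (`Option<File>` in `TransformOutput`): the chunking code falls back to an empty iterator through the `either` crate when a file is absent. A minimal, illustrative sketch of that pattern, assuming only the `either` crate and with a plain `Vec` standing in for the grenad reader:

use std::iter;

use either::Either;

// Sketch only: when the optional source is `None`, substitute an empty iterator so the
// downstream consumer needs no special case. `Either` implements `Iterator` when both
// of its sides do, which is what the chunk iterators above rely on.
fn chunks(source: Option<Vec<u32>>) -> impl Iterator<Item = u32> {
    match source {
        Some(values) => Either::Left(values.into_iter()),
        None => Either::Right(iter::empty()),
    }
}

fn main() {
    assert_eq!(chunks(Some(vec![1, 2, 3])).count(), 3);
    assert_eq!(chunks(None).count(), 0);
}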
to_remove.sort_unstable(); to_remove.dedup(); @@ -466,8 +462,6 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - let mut documents_deleted = 0; let mut document_sorter_value_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); @@ -686,8 +680,6 @@ impl<'a, 'i> Transform<'a, 'i> { where F: Fn(UpdateIndexingStep) + Sync, { - puffin::profile_function!(); - let primary_key = self .index .primary_key(wtxn)? @@ -808,11 +800,15 @@ impl<'a, 'i> Transform<'a, 'i> { })?; let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?; + let fields_ids_map = self.fields_ids_map; + let primary_key_id = self.index.primary_key(wtxn)?.and_then(|name| fields_ids_map.id(name)); let mut new_inner_settings = old_inner_settings.clone(); - new_inner_settings.fields_ids_map = self.fields_ids_map; + new_inner_settings.fields_ids_map = fields_ids_map; + let settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, + primary_key_id, embedding_configs_updated: false, settings_update_only: false, }; @@ -822,10 +818,12 @@ impl<'a, 'i> Transform<'a, 'i> { settings_diff, field_distribution, documents_count: self.documents_count, - original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, - flattened_documents: flattened_documents - .into_inner() - .map_err(|err| err.into_error())?, + original_documents: Some( + original_documents.into_inner().map_err(|err| err.into_error())?, + ), + flattened_documents: Some( + flattened_documents.into_inner().map_err(|err| err.into_error())?, + ), }) } @@ -835,34 +833,66 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, - original_obkv_buffer: &mut Vec, - flattened_obkv_buffer: &mut Vec, + modified_faceted_fields: &HashSet, + original_obkv_buffer: Option<&mut Vec>, + flattened_obkv_buffer: Option<&mut Vec>, ) -> Result<()> { - let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); - let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); + // Always keep the primary key. + let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; + + // If only the `searchableAttributes` has been changed, keep only the searchable fields. + let must_reindex_searchables = settings_diff.reindex_searchable(); + let necessary_searchable_field = |id: FieldId| -> bool { + must_reindex_searchables + && (settings_diff.old.searchable_fields_ids.contains(&id) + || settings_diff.new.searchable_fields_ids.contains(&id)) + }; + + // If only a faceted field has been added, keep only this field. + let must_reindex_facets = settings_diff.reindex_facets(); + let necessary_faceted_field = |id: FieldId| -> bool { + let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); + must_reindex_facets + && modified_faceted_fields + .iter() + .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) + }; + + // Alway provide all fields when vectors are involved because + // we need the fields for the prompt/templating. + let reindex_vectors = settings_diff.reindex_vectors(); + let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. 
- for (id, name) in new_fields_ids_map.iter() { - if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) { + for (id, val) in old_obkv.iter() { + if is_primary_key(id) + || necessary_searchable_field(id) + || necessary_faceted_field(id) + || reindex_vectors + { obkv_writer.insert(id, val)?; } } let data = obkv_writer.into_inner()?; - let new_obkv = KvReader::::new(&data); + let obkv = KvReader::::new(&data); - // take the non-flattened version if flatten_from_fields_ids_map returns None. - let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?; - let old_flattened = - old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::::new); - let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?; - let new_flattened = - new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::::new); + if let Some(original_obkv_buffer) = original_obkv_buffer { + original_obkv_buffer.clear(); + into_del_add_obkv(obkv, DelAddOperation::DeletionAndAddition, original_obkv_buffer)?; + } - original_obkv_buffer.clear(); - flattened_obkv_buffer.clear(); + if let Some(flattened_obkv_buffer) = flattened_obkv_buffer { + // take the non-flattened version if flatten_from_fields_ids_map returns None. + let mut fields_ids_map = settings_diff.new.fields_ids_map.clone(); + let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?; + let flattened = flattened.as_deref().map_or(obkv, KvReader::new); - del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?; - del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?; + flattened_obkv_buffer.clear(); + into_del_add_obkv( + flattened, + DelAddOperation::DeletionAndAddition, + flattened_obkv_buffer, + )?; + } Ok(()) } @@ -891,46 +921,63 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_count = documents_ids.len() as usize; // We initialize the sorter with the user indexing settings. - let mut original_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - keep_first, - self.indexer_settings.chunk_compression_type, - self.indexer_settings.chunk_compression_level, - self.indexer_settings.max_nb_chunks, - self.indexer_settings.max_memory.map(|mem| mem / 2), - ); + let mut original_sorter = if settings_diff.reindex_vectors() { + Some(create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), + )) + } else { + None + }; // We initialize the sorter with the user indexing settings. 
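The field filter introduced in `rebind_existing_document` above keeps a field of an existing document only when the triggered reindex still needs it. A rough standalone sketch of that decision, assuming plain id sets rather than milli's settings diff (the real code also matches faceted fields by name through `is_faceted_by`):

use std::collections::HashSet;

// Illustrative sketch: keep the field when it is the primary key, when searchables are
// reindexed and the field is searchable, when facets are reindexed and the field is one
// of the modified faceted fields, or always when vectors are reindexed, because prompt
// templating may need any field.
fn keep_field(
    id: u16,
    primary_key_id: Option<u16>,
    reindex_searchable: bool,
    searchable_ids: &HashSet<u16>,
    reindex_facets: bool,
    modified_faceted_ids: &HashSet<u16>,
    reindex_vectors: bool,
) -> bool {
    primary_key_id == Some(id)
        || (reindex_searchable && searchable_ids.contains(&id))
        || (reindex_facets && modified_faceted_ids.contains(&id))
        || reindex_vectors
}

fn main() {
    let searchable: HashSet<u16> = [1, 2].into_iter().collect();
    let faceted: HashSet<u16> = [3].into_iter().collect();
    // Only searchables are reindexed here: the searchable field is kept, the faceted one is not.
    assert!(keep_field(1, Some(0), true, &searchable, false, &faceted, false));
    assert!(!keep_field(3, Some(0), true, &searchable, false, &faceted, false));
}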
- let mut flattened_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - keep_first, - self.indexer_settings.chunk_compression_type, - self.indexer_settings.chunk_compression_level, - self.indexer_settings.max_nb_chunks, - self.indexer_settings.max_memory.map(|mem| mem / 2), - ); + let mut flattened_sorter = + if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { + Some(create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), + )) + } else { + None + }; - let mut original_obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); - let mut document_sorter_key_buffer = Vec::new(); - for result in self.index.external_documents_ids().iter(wtxn)? { - let (external_id, docid) = result?; - let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; + if original_sorter.is_some() || flattened_sorter.is_some() { + let modified_faceted_fields = settings_diff.modified_faceted_fields(); + let mut original_obkv_buffer = Vec::new(); + let mut flattened_obkv_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); + for result in self.index.external_documents_ids().iter(wtxn)? { + let (external_id, docid) = result?; + let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; - Self::rebind_existing_document( - old_obkv, - &settings_diff, - &mut original_obkv_buffer, - &mut flattened_obkv_buffer, - )?; + Self::rebind_existing_document( + old_obkv, + &settings_diff, + &modified_faceted_fields, + Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), + Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), + )?; - document_sorter_key_buffer.clear(); - document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); - document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); - original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; - flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + if let Some(original_sorter) = original_sorter.as_mut() { + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + } + if let Some(flattened_sorter) = flattened_sorter.as_mut() { + flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + } + } } let grenad_params = GrenadParameters { @@ -941,17 +988,22 @@ impl<'a, 'i> Transform<'a, 'i> { }; // Once we have written all the documents, we merge everything into a Reader. 
- let original_documents = sorter_into_reader(original_sorter, grenad_params)?; - - let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; + let flattened_documents = match flattened_sorter { + Some(flattened_sorter) => Some(sorter_into_reader(flattened_sorter, grenad_params)?), + None => None, + }; + let original_documents = match original_sorter { + Some(original_sorter) => Some(sorter_into_reader(original_sorter, grenad_params)?), + None => None, + }; Ok(TransformOutput { primary_key, field_distribution, settings_diff, documents_count, - original_documents: original_documents.into_inner().into_inner(), - flattened_documents: flattened_documents.into_inner().into_inner(), + original_documents: original_documents.map(|od| od.into_inner().into_inner()), + flattened_documents: flattened_documents.map(|fd| fd.into_inner().into_inner()), }) } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 6aad290e5..27f760c2a 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -118,65 +118,6 @@ impl TypedChunk { } } -impl TypedChunk { - pub fn to_debug_string(&self) -> String { - match self { - TypedChunk::FieldIdDocidFacetStrings(grenad) => { - format!("FieldIdDocidFacetStrings {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdDocidFacetNumbers(grenad) => { - format!("FieldIdDocidFacetNumbers {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::Documents(grenad) => { - format!("Documents {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdWordCountDocids(grenad) => { - format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => format!( - "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", - word_docids_reader.len(), - exact_word_docids_reader.len(), - word_fid_docids_reader.len() - ), - TypedChunk::WordPositionDocids(grenad) => { - format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::WordPairProximityDocids(grenad) => { - format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetStringDocids((grenad, _)) => { - format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetNumberDocids(grenad) => { - format!("FieldIdFacetNumberDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetExistsDocids(grenad) => { - format!("FieldIdFacetExistsDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetIsNullDocids(grenad) => { - format!("FieldIdFacetIsNullDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::FieldIdFacetIsEmptyDocids(grenad) => { - format!("FieldIdFacetIsEmptyDocids {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::GeoPoints(grenad) => { - format!("GeoPoints {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension, embedder_name } => { - format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {}, embedder_name: {} }}", 
remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension, embedder_name) - } - TypedChunk::ScriptLanguageDocids(sl_map) => { - format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) - } - } - } -} - /// Write typed chunk in the corresponding LMDB database of the provided index. /// Return new documents seen. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] @@ -185,14 +126,16 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, wtxn: &mut RwTxn, ) -> Result<(RoaringBitmap, bool)> { - puffin::profile_function!(typed_chunks[0].to_debug_string()); - let mut is_merged_database = false; match typed_chunks[0] { TypedChunk::Documents(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "documents"); let _entered = span.enter(); + let fields_ids_map = index.fields_ids_map(wtxn)?; + let vectors_fid = + fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn); for typed_chunk in typed_chunks { let TypedChunk::Documents(chunk) = typed_chunk else { @@ -206,6 +149,10 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; + + let embedders: BTreeSet<_> = + index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); + let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); @@ -219,7 +166,35 @@ pub(crate) fn write_typed_chunk_into_index( let del_add_reader = KvReaderDelAdd::new(value); if let Some(addition) = del_add_reader.get(DelAdd::Addition) { - writer.insert(field_id, addition)?; + let addition = if vectors_fid == Some(field_id) { + 'vectors: { + vectors_buffer.clear(); + let Ok(mut vectors) = + crate::vector::parsed_vectors::ParsedVectors::from_bytes( + addition, + ) + else { + // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is + break 'vectors Some(addition); + }; + vectors.retain_user_provided_vectors(&embedders); + let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; + if vectors.is_empty() { + // skip writing empty `_vectors` map + break 'vectors None; + } + + serde_json::to_writer(&mut vectors_buffer, &vectors) + .map_err(InternalError::SerdeJson)?; + Some(vectors_buffer.as_slice()) + } + } else { + Some(addition) + }; + + if let Some(addition) = addition { + writer.insert(field_id, addition)?; + } } } @@ -661,7 +636,7 @@ pub(crate) fn write_typed_chunk_into_index( )?; let writer_index = (embedder_index as u16) << 8; // FIXME: allow customizing distance - let writers: std::result::Result, _> = (0..=u8::MAX) + let writers: Vec<_> = (0..=u8::MAX) .map(|k| { arroy::Writer::new( index.vector_arroy, @@ -670,7 +645,6 @@ pub(crate) fn write_typed_chunk_into_index( ) }) .collect(); - let writers = writers?; // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); @@ -842,7 +816,6 @@ where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { - puffin::profile_function!(); let mut buffer = Vec::new(); let database = database.remap_types::(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index d68bbb939..24b32b6fa 100644 --- a/milli/src/update/settings.rs +++ 
b/milli/src/update/settings.rs @@ -398,8 +398,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { - puffin::profile_function!(); - // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. if self.index.number_of_documents(self.wtxn)? == 0 { @@ -461,50 +459,39 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(true) } - /// Updates the index's searchable attributes. This causes the field map to be recomputed to - /// reflect the order of the searchable attributes. + /// Updates the index's searchable attributes. fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else let old_fields = self.index.searchable_fields(self.wtxn)?; - let did_change = match old_fields { - // If old_fields is Some, let's check to see if the fields actually changed - Some(old_fields) => { - let new_fields = fields.iter().map(String::as_str).collect::>(); - new_fields != old_fields - } - // If old_fields is None, the fields have changed (because they are being set) - None => true, + let did_change = { + let new_fields = fields.iter().map(String::as_str).collect::>(); + new_fields != old_fields }; if !did_change { return Ok(false); } - // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (distinct_fields, filterable_fields). - let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - - let mut new_fields_ids_map = FieldsIdsMap::new(); + // Since we're updating the settings we can only add new fields at the end of the field id map + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names = fields.iter().unique().map(String::as_str).collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - for (_, name) in old_fields_ids_map.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + // The fields ids map won't change the field id of already present elements thus only the + // new fields will be inserted. + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, - &new_fields_ids_map, + &fields_ids_map, )?; - self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), @@ -1078,10 +1065,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage let embedding_configs_updated = self.update_embedding_configs()?; - let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; + let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; + new_inner_settings.recompute_facets(self.wtxn, self.index)?; + + let primary_key_id = self + .index + .primary_key(self.wtxn)? 
+ .and_then(|name| new_inner_settings.fields_ids_map.id(name)); let inner_settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, + primary_key_id, embedding_configs_updated, settings_update_only: true, }; @@ -1097,10 +1091,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, - + pub(crate) primary_key_id: Option, // TODO: compare directly the embedders. pub(crate) embedding_configs_updated: bool, - pub(crate) settings_update_only: bool, } @@ -1110,13 +1103,8 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { - self.old - .fields_ids_map - .iter() - .zip(self.new.fields_ids_map.iter()) - .any(|(old, new)| old != new) - || self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) || self.old.allowed_separators != self.new.allowed_separators || self.old.dictionary != self.new.dictionary || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields @@ -1143,15 +1131,7 @@ impl InnerIndexSettingsDiff { return true; } - let faceted_updated = - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields); - - self.old - .fields_ids_map - .iter() - .zip(self.new.fields_ids_map.iter()) - .any(|(old, new)| old != new) - || faceted_updated + (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) } pub fn reindex_vectors(&self) -> bool { @@ -1181,7 +1161,7 @@ pub(crate) struct InnerIndexSettings { pub user_defined_faceted_fields: HashSet, pub user_defined_searchable_fields: Option>, pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Option>, + pub searchable_fields_ids: Vec, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, @@ -1262,18 +1242,21 @@ impl InnerIndexSettings { // find and insert the new field ids pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { + let searchable_fields = self + .user_defined_searchable_fields + .as_ref() + .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); + // in case new fields were introduced we're going to recreate the searchable fields. 
- if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() { - let searchable_fields = - searchable_fields.iter().map(String::as_ref).collect::>(); + if let Some(searchable_fields) = searchable_fields { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, &self.fields_ids_map, )?; - let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - self.searchable_fields_ids = searchable_fields_ids; } + let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; + self.searchable_fields_ids = searchable_fields_ids; Ok(()) } @@ -1546,12 +1529,13 @@ mod tests { use big_s::S; use heed::types::Bytes; use maplit::{btreemap, btreeset, hashset}; + use meili_snap::snapshot; use super::*; use crate::error::Error; use crate::index::tests::TempIndex; use crate::update::ClearDocuments; - use crate::{Criterion, Filter, SearchResult}; + use crate::{db_snap, Criterion, Filter, SearchResult}; #[test] fn set_and_reset_searchable_fields() { @@ -1580,6 +1564,17 @@ mod tests { wtxn.commit().unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + "###); + // Check that the searchable field is correctly set to "name" only. let rtxn = index.read_txn().unwrap(); // When we search for something that is not in @@ -1591,8 +1586,9 @@ mod tests { // we must find the appropriate document. let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); assert_eq!(documents.len(), 1); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); drop(rtxn); // We change the searchable fields to be the "name" field only. @@ -1602,14 +1598,31 @@ mod tests { }) .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + 2 0 | + "###); + // Check that the searchable field have been reset and documents are found now. 
let rtxn = index.read_txn().unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); + let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap(); + snapshot!(format!("{user_defined_searchable_fields:?}"), @"None"); + // the searchable fields should contain all the fields let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - assert_eq!(searchable_fields, None); + snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###); let result = index.search(&rtxn).query("23").execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); } #[test] diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 1db066058..925635f80 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -52,8 +52,6 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - puffin::profile_function!(); - // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index 272d465fd..9b6aa21ae 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -57,7 +57,6 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - puffin::profile_function!(); debug!("Computing and writing the word levels integers docids into LMDB on disk..."); let mut prefix_integer_docids_sorter = create_sorter( diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 8b438cef3..d47d6d14c 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -45,8 +45,6 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { name = "words_prefix_fst" )] pub fn execute(self) -> Result<()> { - puffin::profile_function!(); - let words_fst = self.index.words_fst(self.wtxn)?; let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 306c1c1e9..1922bb389 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -13,6 +13,7 @@ pub mod error; pub mod hf; pub mod manual; pub mod openai; +pub mod parsed_vectors; pub mod settings; pub mod ollama; @@ -147,6 +148,10 @@ impl EmbeddingConfigs { self.get(self.get_default_embedder_name()) } + pub fn inner_as_ref(&self) -> &HashMap, Arc)> { + &self.0 + } + /// Get the name of the default embedder configuration. 
/// /// The default embedder is determined as follows: diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs new file mode 100644 index 000000000..2c61baa9e --- /dev/null +++ b/milli/src/vector/parsed_vectors.rs @@ -0,0 +1,207 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use obkv::KvReader; +use serde_json::{from_slice, Value}; + +use super::Embedding; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; +use crate::{FieldId, InternalError, UserError}; + +pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; + +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(untagged)] +pub enum Vectors { + ImplicitlyUserProvided(VectorOrArrayOfVectors), + Explicit(ExplicitVectors), +} + +impl Vectors { + pub fn into_array_of_vectors(self) -> Vec { + match self { + Vectors::ImplicitlyUserProvided(embeddings) + | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { + embeddings.into_array_of_vectors().unwrap_or_default() + } + } + } +} + +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +pub struct ExplicitVectors { + pub embeddings: VectorOrArrayOfVectors, + pub user_provided: bool, +} + +pub struct ParsedVectorsDiff { + pub old: Option>, + pub new: Option>, +} + +impl ParsedVectorsDiff { + pub fn new( + documents_diff: KvReader<'_, FieldId>, + old_vectors_fid: Option, + new_vectors_fid: Option, + ) -> Result { + let old = match old_vectors_fid + .and_then(|vectors_fid| documents_diff.get(vectors_fid)) + .map(KvReaderDelAdd::new) + .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) + .transpose() + { + Ok(del) => del, + // ignore wrong shape for old version of documents, use an empty map in this case + Err(Error::InvalidMap(value)) => { + tracing::warn!(%value, "Previous version of the `_vectors` field had a wrong shape"); + Default::default() + } + Err(error) => { + return Err(error); + } + } + .flatten(); + let new = new_vectors_fid + .and_then(|vectors_fid| documents_diff.get(vectors_fid)) + .map(KvReaderDelAdd::new) + .map(|obkv| to_vector_map(obkv, DelAdd::Addition)) + .transpose()? 
+ .flatten(); + Ok(Self { old, new }) + } + + pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { + let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); + let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); + (old, new) + } +} + +pub struct ParsedVectors(pub BTreeMap); + +impl ParsedVectors { + pub fn from_bytes(value: &[u8]) -> Result { + let Ok(value) = from_slice(value) else { + let value = from_slice(value).map_err(Error::InternalSerdeJson)?; + return Err(Error::InvalidMap(value)); + }; + Ok(ParsedVectors(value)) + } + + pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, v| match v { + Vectors::ImplicitlyUserProvided(_) => true, + Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { + *user_provided + // if the embedder is not in the config, then never touch it + || !embedders.contains(k) + } + }); + } +} + +pub enum Error { + InvalidMap(Value), + InternalSerdeJson(serde_json::Error), +} + +impl Error { + pub fn to_crate_error(self, document_id: String) -> crate::Error { + match self { + Error::InvalidMap(value) => { + crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value }) + } + Error::InternalSerdeJson(error) => { + crate::Error::InternalError(InternalError::SerdeJson(error)) + } + } + } +} + +fn to_vector_map( + obkv: KvReaderDelAdd, + side: DelAdd, +) -> Result>, Error> { + Ok(if let Some(value) = obkv.get(side) { + let ParsedVectors(parsed_vectors) = ParsedVectors::from_bytes(value)?; + Some(parsed_vectors) + } else { + None + }) +} + +/// Represents either a vector or an array of multiple vectors. +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(transparent)] +pub struct VectorOrArrayOfVectors { + #[serde(with = "either::serde_untagged_optional")] + inner: Option, Embedding>>, +} + +impl VectorOrArrayOfVectors { + pub fn into_array_of_vectors(self) -> Option> { + match self.inner? 
{ + either::Either::Left(vectors) => Some(vectors), + either::Either::Right(vector) => Some(vec![vector]), + } + } + + pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { + Self { inner: Some(either::Either::Left(array_of_vec)) } + } +} + +#[cfg(test)] +mod test { + use super::VectorOrArrayOfVectors; + + #[test] + fn array_of_vectors() { + let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap(); + let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap(); + let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap(); + let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap(); + let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap(); + let two_vecs: VectorOrArrayOfVectors = + serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); + + insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null"); + insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]"); + insta::assert_json_snapshot!(one.into_array_of_vectors(), @r###" + [ + [ + 0.1 + ] + ] + "###); + insta::assert_json_snapshot!(two.into_array_of_vectors(), @r###" + [ + [ + 0.1, + 0.2 + ] + ] + "###); + insta::assert_json_snapshot!(one_vec.into_array_of_vectors(), @r###" + [ + [ + 0.1, + 0.2 + ] + ] + "###); + insta::assert_json_snapshot!(two_vecs.into_array_of_vectors(), @r###" + [ + [ + 0.1, + 0.2 + ], + [ + 0.3, + 0.4 + ] + ] + "###); + } +} diff --git a/workloads/hackernews.json b/workloads/hackernews.json index 0a99b69ff..5762a7309 100644 --- a/workloads/hackernews.json +++ b/workloads/hackernews.json @@ -54,7 +54,7 @@ "sha256": "27e25efd0b68b159b8b21350d9af76938710cb29ce0393fa71b41c4f3c630ffe" } }, - "commands": [ + "precommands": [ { "route": "indexes/movies/settings", "method": "PATCH", @@ -78,8 +78,10 @@ ] } }, - "synchronous": "DontWait" - }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/workloads/movies-nothreads.json b/workloads/movies-nothreads.json index 175daacf9..d3dd006fa 100644 --- a/workloads/movies-nothreads.json +++ b/workloads/movies-nothreads.json @@ -11,7 +11,7 @@ "sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1" } }, - "commands": [ + "precommands": [ { "route": "indexes/movies/settings", "method": "PATCH", @@ -30,8 +30,10 @@ ] } }, - "synchronous": "DontWait" - }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/movies-subset-hf-embeddings.json index d24bc752c..d7672cf73 100644 --- a/workloads/movies-subset-hf-embeddings.json +++ b/workloads/movies-subset-hf-embeddings.json @@ -11,7 +11,7 @@ "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" } }, - "commands": [ + "precommands": [ { "route": "experimental-features", "method": "PATCH", @@ -55,7 +55,9 @@ } }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", diff --git a/workloads/settings-add-embeddings.json b/workloads/settings-add-embeddings.json index f87286943..6ad50769a 100644 --- a/workloads/settings-add-embeddings.json +++ b/workloads/settings-add-embeddings.json @@ -11,7 +11,7 @@ "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" } }, - "commands": [ + "precommands": [ { "route": "experimental-features", "method": "PATCH", @@ -49,7 +49,9 @@ "asset": "movies-100.json" }, "synchronous": 
"WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/movies/settings", "method": "PATCH", diff --git a/workloads/settings-add-remove-filters.json b/workloads/settings-add-remove-filters.json index 12493a8fc..f017ed960 100644 --- a/workloads/settings-add-remove-filters.json +++ b/workloads/settings-add-remove-filters.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -59,7 +59,9 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", diff --git a/workloads/settings-proximity-precision.json b/workloads/settings-proximity-precision.json index 384f99e37..ac6d98da0 100644 --- a/workloads/settings-proximity-precision.json +++ b/workloads/settings-proximity-precision.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -61,7 +61,9 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", diff --git a/workloads/settings-remove-add-swap-searchable.json b/workloads/settings-remove-add-swap-searchable.json index 61db8822e..7f70d1ce8 100644 --- a/workloads/settings-remove-add-swap-searchable.json +++ b/workloads/settings-remove-add-swap-searchable.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -61,7 +61,9 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", diff --git a/workloads/settings-typo.json b/workloads/settings-typo.json index 45163bc98..e04135877 100644 --- a/workloads/settings-typo.json +++ b/workloads/settings-typo.json @@ -11,7 +11,7 @@ "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" } }, - "commands": [ + "precommands": [ { "route": "indexes/peoples/settings", "method": "PATCH", @@ -62,14 +62,18 @@ "asset": "150k-people.json" }, "synchronous": "WaitForTask" - }, + } + ], + "commands": [ { "route": "indexes/peoples/settings", "method": "PATCH", "body": { "inline": { "typoTolerance": { - "disableOnAttributes": ["featured_job_organization_name"] + "disableOnAttributes": [ + "featured_job_organization_name" + ] } } }, @@ -93,7 +97,22 @@ "body": { "inline": { "typoTolerance": { - "disableOnWords": ["Ben","Elowitz","Kevin","Flaherty", "Ron", "Dustin", "Owen", "Chris", "Mark", "Matt", "Peter", "Van", "Head", "of"] + "disableOnWords": [ + "Ben", + "Elowitz", + "Kevin", + "Flaherty", + "Ron", + "Dustin", + "Owen", + "Chris", + "Mark", + "Matt", + "Peter", + "Van", + "Head", + "of" + ] } } }, diff --git a/xtask/src/bench/client.rs b/xtask/src/bench/client.rs index 3e46615cc..1c2b743af 100644 --- a/xtask/src/bench/client.rs +++ b/xtask/src/bench/client.rs @@ -55,6 +55,10 @@ impl Client { pub fn delete(&self, route: &str) -> reqwest::RequestBuilder { self.request(reqwest::Method::DELETE, route) } + + pub fn base_url(&self) -> Option<&str> { + self.base_url.as_deref() + } } #[derive(Debug, Clone, Copy, Deserialize)] diff --git a/xtask/src/bench/dashboard.rs b/xtask/src/bench/dashboard.rs index 3ba0ca58b..67353f7bb 100644 --- 
a/xtask/src/bench/dashboard.rs +++ b/xtask/src/bench/dashboard.rs @@ -18,12 +18,9 @@ pub enum DashboardClient { } impl DashboardClient { - pub fn new(dashboard_url: &str, api_key: Option<&str>) -> anyhow::Result { - let dashboard_client = Client::new( - Some(format!("{}/api/v1", dashboard_url)), - api_key, - Some(std::time::Duration::from_secs(60)), - )?; + pub fn new(dashboard_url: String, api_key: Option<&str>) -> anyhow::Result { + let dashboard_client = + Client::new(Some(dashboard_url), api_key, Some(std::time::Duration::from_secs(60)))?; Ok(Self::Client(dashboard_client)) } @@ -36,7 +33,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(()) }; let response = dashboard_client - .put("machine") + .put("/api/v1/machine") .json(&json!({"hostname": env.hostname})) .send() .await @@ -62,7 +59,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(Uuid::now_v7()) }; let response = dashboard_client - .put("invocation") + .put("/api/v1/invocation") .json(&json!({ "commit": { "sha1": build_info.commit_sha1, @@ -97,7 +94,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(Uuid::now_v7()) }; let response = dashboard_client - .put("workload") + .put("/api/v1/workload") .json(&json!({ "invocation_uuid": invocation_uuid, "name": &workload.name, @@ -124,7 +121,7 @@ impl DashboardClient { let Self::Client(dashboard_client) = self else { return Ok(()) }; let response = dashboard_client - .put("run") + .put("/api/v1/run") .json(&json!({ "workload_uuid": workload_uuid, "data": report @@ -159,7 +156,7 @@ impl DashboardClient { pub async fn mark_as_failed(&self, invocation_uuid: Uuid, failure_reason: Option) { if let DashboardClient::Client(client) = self { let response = client - .post("cancel-invocation") + .post("/api/v1/cancel-invocation") .json(&json!({ "invocation_uuid": invocation_uuid, "failure_reason": failure_reason, @@ -186,4 +183,28 @@ impl DashboardClient { tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); } + + /// Result URL in markdown + pub(crate) fn result_url( + &self, + workload_name: &str, + build_info: &build_info::BuildInfo, + baseline_branch: &str, + ) -> String { + let Self::Client(client) = self else { return Default::default() }; + let Some(base_url) = client.base_url() else { return Default::default() }; + + let Some(commit_sha1) = build_info.commit_sha1 else { return Default::default() }; + + // https://bench.meilisearch.dev/view_spans?commit_sha1=500ddc76b549fb9f1af54b2dd6abfa15960381bb&workload_name=settings-add-remove-filters.json&target_branch=reduce-transform-disk-usage&baseline_branch=main + let mut url = format!( + "{base_url}/view_spans?commit_sha1={commit_sha1}&workload_name={workload_name}" + ); + + if let Some(target_branch) = build_info.branch { + url += &format!("&target_branch={target_branch}&baseline_branch={baseline_branch}"); + } + + format!("[{workload_name} compared with {baseline_branch}]({url})") + } } diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs index 844b64f63..fdb2c4963 100644 --- a/xtask/src/bench/mod.rs +++ b/xtask/src/bench/mod.rs @@ -6,6 +6,7 @@ mod env_info; mod meili_process; mod workload; +use std::io::LineWriter; use std::path::PathBuf; use anyhow::Context; @@ -90,6 +91,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let subscriber = tracing_subscriber::registry().with( tracing_subscriber::fmt::layer() + .with_writer(|| LineWriter::new(std::io::stderr())) .with_span_events(FmtSpan::NEW | 
FmtSpan::CLOSE) .with_filter(filter), ); @@ -110,7 +112,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let dashboard_client = if args.no_dashboard { dashboard::DashboardClient::new_dry() } else { - dashboard::DashboardClient::new(&args.dashboard_url, args.api_key.as_deref())? + dashboard::DashboardClient::new(args.dashboard_url.clone(), args.api_key.as_deref())? }; // reporting uses its own client because keeping the stream open to wait for entries @@ -136,7 +138,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); - let invocation_uuid = dashboard_client.create_invocation( build_info, commit_message, env, max_workloads, reason).await?; + let invocation_uuid = dashboard_client.create_invocation(build_info.clone(), commit_message, env, max_workloads, reason).await?; tracing::info!(workload_count = args.workload_file.len(), "handling workload files"); @@ -144,6 +146,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let workload_runs = tokio::spawn( { let dashboard_client = dashboard_client.clone(); + let mut dashboard_urls = Vec::new(); async move { for workload_file in args.workload_file.iter() { let workload: Workload = serde_json::from_reader( @@ -152,6 +155,8 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { ) .with_context(|| format!("error parsing {} as JSON", workload_file.display()))?; + let workload_name = workload.name.clone(); + workload::execute( &assets_client, &dashboard_client, @@ -163,8 +168,23 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { &args, ) .await?; + + let result_url = dashboard_client.result_url(&workload_name, &build_info, "main"); + + if !result_url.is_empty() { + dashboard_urls.push(result_url); + } + + if let Some(branch) = build_info.branch { + let result_url = dashboard_client.result_url(&workload_name, &build_info, branch); + + + if !result_url.is_empty() { + dashboard_urls.push(result_url); + } + } } - Ok::<(), anyhow::Error>(()) + Ok::<_, anyhow::Error>(dashboard_urls) }}); // handle ctrl-c @@ -176,13 +196,19 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { // wait for the end of the main task, handle result match workload_runs.await { - Ok(Ok(_)) => { + Ok(Ok(urls)) => { tracing::info!("Success"); + println!("☀️ Benchmark invocation completed, please find the results for your workloads below:"); + for url in urls { + println!("- {url}"); + } Ok::<(), anyhow::Error>(()) } Ok(Err(error)) => { tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard"); dashboard_client.mark_as_failed(invocation_uuid, Some(error.to_string())).await; + println!("☔️ Benchmark invocation failed..."); + println!("{error}"); tracing::warn!(%invocation_uuid, "invocation marked as failed following error"); Err(error) }, @@ -191,10 +217,20 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { Ok(panic) => { tracing::error!("invocation panicked, attempting to report the failure to dashboard"); dashboard_client.mark_as_failed( invocation_uuid, Some("Panicked".into())).await; + println!("‼️ Benchmark invocation panicked 😱"); + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + println!("panicked at {msg}"); 
                    std::panic::resume_unwind(panic)
                }
                Err(_) => {
                    tracing::warn!("task was canceled");
+                   println!("🚫 Benchmark invocation was canceled");
                    Ok(())
                }
            }
diff --git a/xtask/src/bench/workload.rs b/xtask/src/bench/workload.rs
index d82c5ad19..db44b5a8f 100644
--- a/xtask/src/bench/workload.rs
+++ b/xtask/src/bench/workload.rs
@@ -22,6 +22,8 @@ pub struct Workload {
     pub run_count: u16,
     pub extra_cli_args: Vec<String>,
     pub assets: BTreeMap<String, Asset>,
+    #[serde(default)]
+    pub precommands: Vec<Command>,
     pub commands: Vec<Command>,
 }
@@ -37,6 +39,15 @@ async fn run_commands(
     let report_folder = &args.report_folder;
     let workload_name = &workload.name;
 
+    for batch in workload
+        .precommands
+        .as_slice()
+        .split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait))
+    {
+        super::command::run_batch(meili_client, batch, &workload.assets, &args.asset_folder)
+            .await?;
+    }
+
     std::fs::create_dir_all(report_folder)
         .with_context(|| format!("could not create report directory at {report_folder}"))?;
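For reference, a toy sketch (not the xtask code itself) of how the `split_inclusive` call above batches precommands: each batch is a run of `DontWait` commands ending with the first command that has to wait, so commands inside a batch can be fired concurrently while batch boundaries enforce synchronization. The `SyncMode` enum and the command sequence below are made up for the example:

// Toy example demonstrating the batching behaviour of `slice::split_inclusive`.
enum SyncMode {
    DontWait,
    WaitForResponse,
    WaitForTask,
}

fn main() {
    use SyncMode::*;
    let commands = [DontWait, DontWait, WaitForTask, DontWait, WaitForResponse];
    let batches: Vec<&[SyncMode]> = commands
        .split_inclusive(|c| !matches!(c, DontWait))
        .collect();
    // Two batches: [DontWait, DontWait, WaitForTask] and [DontWait, WaitForResponse].
    assert_eq!(batches.len(), 2);
    assert_eq!(batches[0].len(), 3);
    assert_eq!(batches[1].len(), 2);
}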