From c02d585f5b99729172f3fb7c4847bc20b126c7c5 Mon Sep 17 00:00:00 2001 From: Eric Long Date: Mon, 12 Feb 2024 14:10:40 +0800 Subject: [PATCH 01/32] Upgrade rustls to 0.21.10 and ring to 0.17 --- Cargo.lock | 182 +++++++++++++---------------------- meilisearch-types/Cargo.toml | 2 +- meilisearch/Cargo.toml | 12 +-- meilisearch/src/main.rs | 2 +- meilisearch/src/option.rs | 4 +- 5 files changed, 78 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 32ad13772..0fa1d5131 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,16 +36,16 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.5.1" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "129d4c88e98860e1758c5de288d1632b07970a16d59bdf7b8d66053d582bb71f" +checksum = "d223b13fd481fc0d1f83bb12659ae774d9e3601814c68a0bc539731698cca743" dependencies = [ "actix-codec", "actix-rt", "actix-service", "actix-tls", "actix-utils", - "ahash 0.8.3", + "ahash 0.8.8", "base64 0.21.7", "bitflags 2.4.1", "brotli", @@ -138,9 +138,9 @@ dependencies = [ [[package]] name = "actix-tls" -version = "3.1.1" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72616e7fbec0aa99c6f3164677fa48ff5a60036d0799c98cab894a44f3e0efc3" +checksum = "d4cce60a2f2b477bc72e5cde0af1812a6e82d8fd85b5570a5dcf2a5bf2c5be5f" dependencies = [ "actix-rt", "actix-service", @@ -148,13 +148,11 @@ dependencies = [ "futures-core", "impl-more", "pin-project-lite", - "rustls 0.21.6", - "rustls-webpki", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls", "tokio-util", "tracing", - "webpki-roots 0.22.6", + "webpki-roots", ] [[package]] @@ -169,9 +167,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.4.1" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e43428f3bf11dee6d166b00ec2df4e3aa8cc1606aaa0b7433c146852e2f4e03b" +checksum = "43a6556ddebb638c2358714d853257ed226ece6023ef9364f23f0c70737ea984" dependencies = [ "actix-codec", "actix-http", @@ -183,7 +181,7 @@ dependencies = [ "actix-tls", "actix-utils", "actix-web-codegen", - "ahash 0.8.3", + "ahash 0.8.8", "bytes", "bytestring", "cfg-if", @@ -270,14 +268,15 @@ dependencies = [ [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -834,9 +833,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.82" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "jobserver", "libc", @@ -2126,8 +2125,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -2251,7 +2252,7 @@ dependencies = [ "atomic-polyfill", "hash32", "rustc_version", - "spin 0.9.8", + "spin", "stable_deref_trait", ] @@ -2420,9 +2421,9 @@ dependencies = [ "futures-util", "http 0.2.11", "hyper", - "rustls 0.21.6", + "rustls", "tokio", - 
"tokio-rustls 0.24.1", + "tokio-rustls", ] [[package]] @@ -3124,13 +3125,14 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.3.0" +version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.7", + "js-sys", "pem", - "ring 0.16.20", + "ring", "serde", "serde_json", "simple_asn1", @@ -3721,7 +3723,7 @@ dependencies = [ "rayon", "regex", "reqwest", - "rustls 0.20.9", + "rustls", "rustls-pemfile", "segment", "serde", @@ -4257,11 +4259,12 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.1.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ - "base64 0.13.1", + "base64 0.21.7", + "serde", ] [[package]] @@ -4792,20 +4795,20 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.6", + "rustls", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "system-configuration", "tokio", - "tokio-rustls 0.24.1", + "tokio-rustls", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots 0.25.3", + "webpki-roots", "winreg", ] @@ -4823,30 +4826,15 @@ checksum = "b9b1a3d5f46d53f4a3478e2be4a5a5ce5108ea58b100dcd139830eae7f79a3a1" [[package]] name = "ring" -version = "0.16.20" +version = "0.17.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - -[[package]] -name = "ring" -version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9babe80d5c16becf6594aa32ad2be8fe08498e7ae60b77de8df700e67f191d7e" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" dependencies = [ "cc", "getrandom", "libc", - "spin 0.9.8", - "untrusted 0.9.0", + "spin", + "untrusted", "windows-sys 0.48.0", ] @@ -4924,24 +4912,12 @@ dependencies = [ [[package]] name = "rustls" -version = "0.20.9" +version = "0.21.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ "log", - "ring 0.16.20", - "sct", - "webpki", -] - -[[package]] -name = "rustls" -version = "0.21.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1feddffcfcc0b33f5c6ce9a29e341e4cd59c3f78e7ee45f4a40c038b1d6cbb" -dependencies = [ - "log", - "ring 0.16.20", + "ring", "rustls-webpki", "sct", ] @@ -4961,8 +4937,8 @@ version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.3", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -5004,12 +4980,12 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring", + "untrusted", ] [[package]] @@ -5275,12 +5251,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "spin" version = "0.9.8" @@ -5642,24 +5612,13 @@ dependencies = [ "syn 2.0.48", ] -[[package]] -name = "tokio-rustls" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" -dependencies = [ - "rustls 0.20.9", - "tokio", - "webpki", -] - [[package]] name = "tokio-rustls" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.21.6", + "rustls", "tokio", ] @@ -5915,12 +5874,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "untrusted" version = "0.9.0" @@ -5937,13 +5890,13 @@ dependencies = [ "flate2", "log", "once_cell", - "rustls 0.21.6", + "rustls", "rustls-webpki", "serde", "serde_json", "socks", "url", - "webpki-roots 0.25.3", + "webpki-roots", ] [[package]] @@ -6153,25 +6106,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.22.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" -dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", -] - -[[package]] -name = "webpki-roots" -version = "0.22.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" -dependencies = [ - "webpki", -] - [[package]] name = "webpki-roots" version = "0.25.3" @@ -6533,6 +6467,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "zerofrom" version = "0.1.3" diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index f5bfaa036..b9edb4c1e 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -11,7 +11,7 @@ edition.workspace = true license.workspace = true [dependencies] -actix-web = { version = "4.4.1", default-features = false } +actix-web = { version = "4.5.1", default-features = false } anyhow = "1.0.79" convert_case = "0.6.0" csv = "1.3.0" diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 7fbabba87..f8a50238a 100644 --- a/meilisearch/Cargo.toml +++ 
b/meilisearch/Cargo.toml @@ -14,18 +14,18 @@ default-run = "meilisearch" [dependencies] actix-cors = "0.7.0" -actix-http = { version = "3.5.1", default-features = false, features = [ +actix-http = { version = "3.6.0", default-features = false, features = [ "compress-brotli", "compress-gzip", - "rustls", + "rustls-0_21", ] } actix-utils = "3.0.1" -actix-web = { version = "4.4.1", default-features = false, features = [ +actix-web = { version = "4.5.1", default-features = false, features = [ "macros", "compress-brotli", "compress-gzip", "cookies", - "rustls", + "rustls-0_21", ] } actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true } anyhow = { version = "1.0.79", features = ["backtrace"] } @@ -52,7 +52,7 @@ index-scheduler = { path = "../index-scheduler" } indexmap = { version = "2.1.0", features = ["serde"] } is-terminal = "0.4.10" itertools = "0.11.0" -jsonwebtoken = "8.3.0" +jsonwebtoken = "9.2.0" lazy_static = "1.4.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } @@ -75,7 +75,7 @@ reqwest = { version = "0.11.23", features = [ "rustls-tls", "json", ], default-features = false } -rustls = "0.20.8" +rustls = "0.21.6" rustls-pemfile = "1.0.2" segment = { version = "0.2.3", optional = true } serde = { version = "1.0.195", features = ["derive"] } diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index ed18fb97e..1e067b43e 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -133,7 +133,7 @@ async fn run_http( .keep_alive(KeepAlive::Os); if let Some(config) = opt_clone.get_ssl_config()? { - http_server.bind_rustls(opt_clone.http_addr, config)?.run().await?; + http_server.bind_rustls_021(opt_clone.http_addr, config)?.run().await?; } else { http_server.bind(&opt_clone.http_addr)?.run().await?; } diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 9586a3f6f..96bc29006 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -503,11 +503,11 @@ impl Opt { } if self.ssl_require_auth { let verifier = AllowAnyAuthenticatedClient::new(client_auth_roots); - config.with_client_cert_verifier(verifier) + config.with_client_cert_verifier(Arc::from(verifier)) } else { let verifier = AllowAnyAnonymousOrAuthenticatedClient::new(client_auth_roots); - config.with_client_cert_verifier(verifier) + config.with_client_cert_verifier(Arc::from(verifier)) } } None => config.with_no_client_auth(), From 024de0dcf834d034fa4a8372a76139a77fd035b6 Mon Sep 17 00:00:00 2001 From: curquiza Date: Wed, 14 Feb 2024 17:36:39 +0100 Subject: [PATCH 02/32] Create automation when creating Milestone to create update-version issue --- .github/workflows/milestone-workflow.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/milestone-workflow.yml b/.github/workflows/milestone-workflow.yml index 2b8b7bf62..2ede3dc21 100644 --- a/.github/workflows/milestone-workflow.yml +++ b/.github/workflows/milestone-workflow.yml @@ -110,6 +110,25 @@ jobs: --milestone $MILESTONE_VERSION \ --assignee curquiza + create-update-version-issue: + needs: get-release-version + # Create the changelog issue if the release is not only a patch release + if: github.event.action == 'created' + runs-on: ubuntu-latest + env: + ISSUE_TEMPLATE: issue-template.md + steps: + - uses: actions/checkout@v3 + - name: Download the issue template + run: curl -s 
https://raw.githubusercontent.com/meilisearch/engine-team/main/issue-templates/update-version-issue.md > $ISSUE_TEMPLATE + - name: Create the issue + run: | + gh issue create \ + --title "Update version in Cargo.toml for $MILESTONE_VERSION" \ + --label 'maintenance' \ + --body-file $ISSUE_TEMPLATE \ + --milestone $MILESTONE_VERSION + # ---------------- # MILESTONE CLOSED # ---------------- From 9ee4f55e6c72116446438dbc03e6cf20c12f9081 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 7 Sep 2023 11:16:51 +0200 Subject: [PATCH 03/32] let you specify your task id --- index-scheduler/src/error.rs | 4 + index-scheduler/src/lib.rs | 815 ++++++++++++-------- meilisearch/src/lib.rs | 2 +- meilisearch/src/routes/dump.rs | 5 +- meilisearch/src/routes/indexes/documents.rs | 25 +- meilisearch/src/routes/indexes/mod.rs | 12 +- meilisearch/src/routes/indexes/settings.rs | 18 +- meilisearch/src/routes/mod.rs | 30 +- meilisearch/src/routes/snapshot.rs | 5 +- meilisearch/src/routes/swap_indexes.rs | 5 +- meilisearch/src/routes/tasks.rs | 9 +- meilisearch/tests/index/create_index.rs | 71 ++ 12 files changed, 655 insertions(+), 346 deletions(-) diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs index bbe526460..223b84762 100644 --- a/index-scheduler/src/error.rs +++ b/index-scheduler/src/error.rs @@ -48,6 +48,8 @@ impl From for Code { pub enum Error { #[error("{1}")] WithCustomErrorCode(Code, Box), + #[error("Received bad task id: {received} should be >= to {expected}.")] + BadTaskId { received: TaskId, expected: TaskId }, #[error("Index `{0}` not found.")] IndexNotFound(String), #[error("Index `{0}` already exists.")] @@ -161,6 +163,7 @@ impl Error { match self { Error::IndexNotFound(_) | Error::WithCustomErrorCode(_, _) + | Error::BadTaskId { .. } | Error::IndexAlreadyExists(_) | Error::SwapDuplicateIndexFound(_) | Error::SwapDuplicateIndexesFound(_) @@ -205,6 +208,7 @@ impl ErrorCode for Error { fn error_code(&self) -> Code { match self { Error::WithCustomErrorCode(code, _) => *code, + Error::BadTaskId { .. } => Code::BadRequest, Error::IndexNotFound(_) => Code::IndexNotFound, Error::IndexAlreadyExists(_) => Code::IndexAlreadyExists, Error::SwapDuplicateIndexesFound(_) => Code::InvalidSwapDuplicateIndexFound, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 7514a2a68..b1edaabe5 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -993,7 +993,7 @@ impl IndexScheduler { /// Register a new task in the scheduler. /// /// If it fails and data was associated with the task, it tries to delete the associated data. - pub fn register(&self, kind: KindWithContent) -> Result { + pub fn register(&self, kind: KindWithContent, task_id: Option) -> Result { let mut wtxn = self.env.write_txn()?; // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task @@ -1003,8 +1003,16 @@ impl IndexScheduler { return Err(Error::NoSpaceLeftInTaskQueue); } + let next_task_id = self.next_task_id(&wtxn)?; + + if let Some(uid) = task_id { + if uid < next_task_id { + return Err(Error::BadTaskId { received: uid, expected: next_task_id }); + } + } + let mut task = Task { - uid: self.next_task_id(&wtxn)?, + uid: task_id.unwrap_or(next_task_id), enqueued_at: OffsetDateTime::now_utc(), started_at: None, finished_at: None, @@ -1386,13 +1394,16 @@ impl IndexScheduler { // increase time by one nanosecond so that the enqueuedAt of the last task to delete is also lower than that date. 
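// (`beforeEnqueuedAt` matches strictly-before dates, so the +1ns bump below is the smallest step that lets the last task's own `enqueued_at` qualify for deletion.)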
let delete_before = last_task_to_delete.enqueued_at + Duration::from_nanos(1); - self.register(KindWithContent::TaskDeletion { - query: format!( - "?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled", - delete_before.format(&Rfc3339).map_err(|_| Error::CorruptedTaskQueue)?, - ), - tasks: to_delete, - })?; + self.register( + KindWithContent::TaskDeletion { + query: format!( + "?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled", + delete_before.format(&Rfc3339).map_err(|_| Error::CorruptedTaskQueue)?, + ), + tasks: to_delete, + }, + None, + )?; Ok(()) } @@ -2016,7 +2027,7 @@ mod tests { for (idx, kind) in kinds.into_iter().enumerate() { let k = kind.as_kind(); - let task = index_scheduler.register(kind).unwrap(); + let task = index_scheduler.register(kind, None).unwrap(); index_scheduler.assert_internally_consistent(); assert_eq!(task.uid, idx as u32); @@ -2031,18 +2042,18 @@ mod tests { fn insert_task_while_another_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id")).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated]); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation"); // while the task is processing can we register another task? - index_scheduler.register(index_creation_task("index_b", "id")).unwrap(); + index_scheduler.register(index_creation_task("index_b", "id"), None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }) + .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); } @@ -2051,7 +2062,7 @@ mod tests { fn test_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id")).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task"); handle.advance_till([Start, BatchCreated]); @@ -2065,17 +2076,23 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2094,22 +2111,25 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); index_scheduler - 
.register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_fourth_task"); @@ -2142,7 +2162,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2151,10 +2171,13 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); index_scheduler - .register(KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0, 1]), - }) + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0, 1]), + }, + None, + ) .unwrap(); // again, no progress made at all, but one more task is registered snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_enqueued"); @@ -2188,7 +2211,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2199,10 +2222,13 @@ mod tests { // Now we delete the first task index_scheduler - .register(KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_task_deletion"); @@ -2225,7 +2251,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2237,10 +2263,13 @@ mod tests { // Now we delete the first task multiple times in a row for _ in 0..2 { index_scheduler - .register(KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2263,14 +2292,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); 
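// The payload file is persisted first so the scheduler can read it back through `content_file: uuid` when the batch is processed.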
index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); @@ -2292,7 +2324,10 @@ mod tests { }"#; index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2300,19 +2335,22 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2336,21 +2374,27 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1"), S("2")], - }) + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1"), S("2")], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2373,10 +2417,13 @@ mod tests { fn document_deletion_and_document_addition() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); index_scheduler - .register(KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1"), S("2")], - }) + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1"), S("2")], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2390,14 +2437,17 @@ mod tests { let documents_count = 
read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2428,17 +2478,20 @@ mod tests { for name in index_names { index_scheduler - .register(KindWithContent::IndexCreation { - index_uid: name.to_string(), - primary_key: None, - }) + .register( + KindWithContent::IndexCreation { + index_uid: name.to_string(), + primary_key: None, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } for name in index_names { index_scheduler - .register(KindWithContent::DocumentClear { index_uid: name.to_string() }) + .register(KindWithContent::DocumentClear { index_uid: name.to_string() }, None) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2463,7 +2516,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2477,18 +2530,24 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_d"); index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("c".to_owned(), "d".to_owned()) }, - ], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("c".to_owned(), "d".to_owned()) }, + ], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_registered"); index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "two_swaps_registered"); @@ -2498,7 +2557,7 @@ mod tests { handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_swap_processed"); - index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }).unwrap(); + index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }, None).unwrap(); handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_empty_swap_processed"); } @@ -2515,7 +2574,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_n_successful_batches(4); @@ -2525,12 +2584,15 @@ mod tests { snapshot!(first_snap, name: "initial_tasks_processed"); let err = index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("b".to_owned(), "a".to_owned()) }, - ], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { 
indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("b".to_owned(), "a".to_owned()) }, + ], + }, + None, + ) .unwrap_err(); snapshot!(format!("{err}"), @"Indexes must be declared only once during a swap. `a`, `b` were specified several times."); @@ -2539,13 +2601,16 @@ mod tests { // Index `e` does not exist, but we don't check its existence yet index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("c".to_owned(), "e".to_owned()) }, - IndexSwap { indexes: ("d".to_owned(), "f".to_owned()) }, - ], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("c".to_owned(), "e".to_owned()) }, + IndexSwap { indexes: ("d".to_owned(), "f".to_owned()) }, + ], + }, + None, + ) .unwrap(); handle.advance_one_failed_batch(); // Now the first swap should have an error message saying `e` and `f` do not exist @@ -2566,17 +2631,20 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler)); @@ -2601,7 +2669,7 @@ mod tests { }, ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2618,7 +2686,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0)) + .register(replace_document_import_task("catto", None, 0, documents_count0), None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2626,10 +2694,13 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processed"); index_scheduler - .register(KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); @@ -2644,7 +2715,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0)) + .register(replace_document_import_task("catto", None, 0, documents_count0), None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2652,10 +2723,13 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processing"); index_scheduler - .register(KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskCancelation { + 
query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_task_registered"); @@ -2685,7 +2759,7 @@ mod tests { replace_document_import_task("wolfo", None, 2, documents_count2), ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_one_successful_batch(); @@ -2693,10 +2767,13 @@ mod tests { handle.advance_till([Start, BatchCreated, InsideProcessBatch]); index_scheduler - .register(KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0, 1, 2]), - }) + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0, 1, 2]), + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processing_second_task_cancel_enqueued"); @@ -2724,14 +2801,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2771,14 +2851,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2820,14 +2903,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2870,14 +2956,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + 
allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2921,14 +3010,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2973,13 +3065,13 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("doggo", "bone"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); let kind = index_creation_task("whalo", "plankton"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); let kind = index_creation_task("catto", "his_own_vomit"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); handle.advance_n_successful_batches(3); @@ -3037,11 +3129,11 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3260,17 +3352,17 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], }; - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3346,20 +3438,20 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _ = 
index_scheduler.register(kind).unwrap(); + let _ = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _ = index_scheduler.register(kind).unwrap(); + let _ = index_scheduler.register(kind, None).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); handle.advance_n_successful_batches(1); let kind = KindWithContent::TaskCancelation { query: "test_query".to_string(), tasks: [0, 1, 2, 3].into_iter().collect(), }; - let task_cancelation = index_scheduler.register(kind).unwrap(); + let task_cancelation = index_scheduler.register(kind, None).unwrap(); handle.advance_n_successful_batches(1); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3394,7 +3486,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); handle.advance_one_failed_batch(); @@ -3419,14 +3511,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated]); @@ -3457,14 +3552,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3513,14 +3611,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3561,14 +3662,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - 
.register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3596,7 +3700,10 @@ mod tests { // Create the index. index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_one_successful_batch(); @@ -3615,14 +3722,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3655,7 +3765,10 @@ mod tests { // Create the index. index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_one_successful_batch(); @@ -3674,14 +3787,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3718,7 +3834,10 @@ mod tests { // Create the index. 
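// (the document additions below run with `allow_index_creation: false`, so they can only succeed because the index is created here first)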
index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_one_successful_batch(); @@ -3738,14 +3857,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3791,14 +3913,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3843,14 +3968,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3904,14 +4032,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3961,14 +4092,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -4042,14 +4176,17 @@ mod tests { file.persist().unwrap(); index_scheduler - 
.register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: primary_key.map(|pk| pk.to_string()), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: primary_key.map(|pk| pk.to_string()), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -4125,14 +4262,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: primary_key.map(|pk| pk.to_string()), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: primary_key.map(|pk| pk.to_string()), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -4186,7 +4326,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated, ProcessBatchFailed, AfterProcessing]); @@ -4206,15 +4346,18 @@ mod tests { }); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); // on average this task takes ~600 bytes loop { - let result = index_scheduler.register(KindWithContent::IndexCreation { - index_uid: S("doggo"), - primary_key: None, - }); + let result = index_scheduler.register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ); if result.is_err() { break; } @@ -4224,7 +4367,10 @@ mod tests { // at this point the task DB shoud have reached its limit and we should not be able to register new tasks let result = index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code @@ -4232,10 +4378,10 @@ mod tests { // Even the task deletion that doesn't delete anything shouldn't be accepted let result = index_scheduler - .register(KindWithContent::TaskDeletion { - query: S("test"), - tasks: RoaringBitmap::new(), - }) + .register( + KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, + None, + ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. 
Please delete tasks to continue performing write operations."); // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code @@ -4243,13 +4389,19 @@ mod tests { // But a task deletion that delete something should works index_scheduler - .register(KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }) + .register( + KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); // Now we should be able to enqueue a few tasks again index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_failed_batch(); } @@ -4262,22 +4414,34 @@ mod tests { }); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_failed_batch(); // at this point the max number of tasks is reached // we can still enqueue multiple tasks index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); let rtxn = index_scheduler.env.read_txn().unwrap(); @@ -4325,11 +4489,11 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" { @@ -4479,11 +4643,11 @@ mod tests { query: "cancel dump".to_owned(), tasks: RoaringBitmap::from_iter([0]), }; - let _ = index_scheduler.register(dump_creation).unwrap(); + let _ = index_scheduler.register(dump_creation, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - let _ = index_scheduler.register(dump_cancellation).unwrap(); + let _ = index_scheduler.register(dump_cancellation, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); @@ -4491,4 +4655,21 @@ mod tests { handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); } + + #[test] + fn basic_set_taskid() { + 
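+ // Passing `None` keeps the auto-allocated uid; an explicit uid is accepted only if it is at least the next internal task id, so ids can skip forward but never rewind.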
let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, None).unwrap(); + snapshot!(task.uid, @"0"); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, Some(12)).unwrap(); + snapshot!(task.uid, @"12"); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let error = index_scheduler.register(kind, Some(5)).unwrap_err(); + snapshot!(error, @"Received bad task id: 5 should be >= to 13."); + } } diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index c43a32cdc..328b9e9b2 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -251,7 +251,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< .name(String::from("register-snapshot-tasks")) .spawn(move || loop { thread::sleep(snapshot_delay); - if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation) { + if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation, None) { error!("Error while registering snapshot: {}", e); } }) diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 071ae60b8..8f44070d8 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -11,7 +11,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::SummarizedTaskView; +use crate::routes::{get_task_id, SummarizedTaskView}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); @@ -29,8 +29,9 @@ pub async fn create_dump( keys: auth_controller.list_keys()?, instance_uid: analytics.instance_uid().cloned(), }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create dump"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 1f41fa10c..492f039cf 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -7,7 +7,7 @@ use bstr::ByteSlice as _; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::Deserr; use futures::StreamExt; -use index_scheduler::IndexScheduler; +use index_scheduler::{IndexScheduler, TaskId}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType}; @@ -36,7 +36,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use crate::routes::{get_task_id, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::search::parse_filter; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -130,9 +130,10 @@ pub async fn delete_document( index_uid: index_uid.to_string(), 
documents_ids: vec![document_id], }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); - debug!(returns = ?task, "Delete document"); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } @@ -277,6 +278,7 @@ pub async fn replace_documents( analytics.add_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); + let uid = get_task_id(&req)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -285,6 +287,7 @@ pub async fn replace_documents( params.csv_delimiter, body, IndexDocumentsMethod::ReplaceDocuments, + uid, allow_index_creation, ) .await?; @@ -309,6 +312,7 @@ pub async fn update_documents( analytics.update_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); + let uid = get_task_id(&req)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -317,6 +321,7 @@ pub async fn update_documents( params.csv_delimiter, body, IndexDocumentsMethod::UpdateDocuments, + uid, allow_index_creation, ) .await?; @@ -334,6 +339,7 @@ async fn document_addition( csv_delimiter: Option, mut body: Payload, method: IndexDocumentsMethod, + task_id: Option, allow_index_creation: bool, ) -> Result { let format = match ( @@ -450,7 +456,7 @@ async fn document_addition( }; let scheduler = index_scheduler.clone(); - let task = match tokio::task::spawn_blocking(move || scheduler.register(task)).await? { + let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id)).await? 
{ Ok(task) => task, Err(e) => { index_scheduler.delete_update_file(uuid)?; @@ -480,8 +486,9 @@ pub async fn delete_documents_batch( let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete documents by batch"); Ok(HttpResponse::Accepted().json(task)) @@ -516,8 +523,9 @@ pub async fn delete_documents_by_filter( .map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?; let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete documents by filter"); Ok(HttpResponse::Accepted().json(task)) @@ -533,8 +541,9 @@ pub async fn clear_all_documents( analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete all documents"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index d80bd9c61..6451d930d 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -17,7 +17,7 @@ use serde_json::json; use time::OffsetDateTime; use tracing::debug; -use super::{Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -137,8 +137,9 @@ pub async fn create_index( ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create index"); Ok(HttpResponse::Accepted().json(task)) @@ -206,8 +207,9 @@ pub async fn update_index( primary_key: body.primary_key, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Update index"); Ok(HttpResponse::Accepted().json(task)) @@ -216,11 +218,13 @@ pub async fn update_index( pub async fn delete_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, + req: HttpRequest, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || 
index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete index"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 23e8925c7..9fbd84161 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -15,7 +15,7 @@ use tracing::debug; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; -use crate::routes::SummarizedTaskView; +use crate::routes::{get_task_id, SummarizedTaskView}; #[macro_export] macro_rules! make_setting_route { @@ -34,7 +34,7 @@ macro_rules! make_setting_route { use $crate::extractors::authentication::policies::*; use $crate::extractors::authentication::GuardedData; use $crate::extractors::sequential_extractor::SeqHandler; - use $crate::routes::SummarizedTaskView; + use $crate::routes::{get_task_id, SummarizedTaskView}; pub async fn delete( index_scheduler: GuardedData< @@ -42,6 +42,7 @@ macro_rules! make_setting_route { Data, >, index_uid: web::Path, + req: HttpRequest, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -56,8 +57,9 @@ macro_rules! make_setting_route { is_deletion: true, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? .into(); @@ -105,8 +107,9 @@ macro_rules! make_setting_route { is_deletion: false, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? 
.into(); @@ -767,8 +770,9 @@ pub async fn update_all( is_deletion: false, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Update all settings"); Ok(HttpResponse::Accepted().json(task)) @@ -790,6 +794,7 @@ pub async fn get_all( pub async fn delete_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, + req: HttpRequest, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -803,8 +808,9 @@ pub async fn delete_all( is_deletion: true, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete all settings"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 89cf63c50..61a9f3352 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -4,7 +4,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; -use meilisearch_types::error::ResponseError; +use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; use serde::{Deserialize, Serialize}; @@ -45,6 +45,34 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/experimental-features").configure(features::configure)); } +pub fn get_task_id(req: &HttpRequest) -> Result, ResponseError> { + let task_id = req + .headers() + .get("TaskId") + .map(|header| { + header.to_str().map_err(|e| { + ResponseError::from_msg( + format!("TaskId is not a valid utf-8 string: {e}"), + Code::BadRequest, + ) + }) + }) + .transpose()? 
+ .map(|s| { + s.parse::().map_err(|e| { + ResponseError::from_msg( + format!( + "Could not parse the TaskId as a {}: {e}", + std::any::type_name::(), + ), + Code::BadRequest, + ) + }) + }) + .transpose()?; + Ok(task_id) +} + #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct SummarizedTaskView { diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index c94529932..28dbac85f 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -10,7 +10,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::SummarizedTaskView; +use crate::routes::{get_task_id, SummarizedTaskView}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); @@ -24,8 +24,9 @@ pub async fn create_snapshot( analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req)); let task = KindWithContent::SnapshotCreation; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create snapshot"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 79e619705..64268dbfa 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -10,7 +10,7 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use serde_json::json; -use super::SummarizedTaskView; +use super::{get_task_id, SummarizedTaskView}; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; @@ -60,7 +60,8 @@ pub async fn swap_indexes( } let task = KindWithContent::IndexSwap { swaps }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); Ok(HttpResponse::Accepted().json(task)) } diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 03b63001d..26e1c43f8 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -18,7 +18,7 @@ use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; use tokio::task; -use super::SummarizedTaskView; +use super::{get_task_id, SummarizedTaskView}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; @@ -197,7 +197,9 @@ async fn cancel_tasks( let task_cancelation = KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; - let task = task::spawn_blocking(move || index_scheduler.register(task_cancelation)).await??; + let uid = get_task_id(&req)?; + let task = + task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid)).await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) @@ -242,7 +244,8 @@ async fn delete_tasks( let task_deletion = KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; - let task = 
task::spawn_blocking(move || index_scheduler.register(task_deletion)).await??; + let uid = get_task_id(&req)?; + let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid)).await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) diff --git a/meilisearch/tests/index/create_index.rs b/meilisearch/tests/index/create_index.rs index 7ce56d440..b9f755f35 100644 --- a/meilisearch/tests/index/create_index.rs +++ b/meilisearch/tests/index/create_index.rs @@ -199,3 +199,74 @@ async fn error_create_with_invalid_index_uid() { } "###); } + +#[actix_rt::test] +async fn send_task_id() { + let server = Server::new().await; + let app = server.init_web_app().await; + let index = server.index("catto"); + let (response, code) = index.create(None).await; + snapshot!(code, @"202 Accepted"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "taskUid": 0, + "indexUid": "catto", + "status": "enqueued", + "type": "indexCreation", + "enqueuedAt": "[date]" + } + "###); + + let body = serde_json::to_string(&json!({ + "uid": "doggo", + "primaryKey": None::<&str>, + })) + .unwrap(); + let req = test::TestRequest::post() + .uri("/indexes") + .insert_header(("TaskId", "25")) + .insert_header(ContentType::json()) + .set_payload(body) + .to_request(); + + let res = test::call_service(&app, req).await; + snapshot!(res.status(), @"202 Accepted"); + + let bytes = test::read_body(res).await; + let response = serde_json::from_slice::(&bytes).expect("Expecting valid json"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "taskUid": 25, + "indexUid": "doggo", + "status": "enqueued", + "type": "indexCreation", + "enqueuedAt": "[date]" + } + "###); + + let body = serde_json::to_string(&json!({ + "uid": "girafo", + "primaryKey": None::<&str>, + })) + .unwrap(); + let req = test::TestRequest::post() + .uri("/indexes") + .insert_header(("TaskId", "12")) + .insert_header(ContentType::json()) + .set_payload(body) + .to_request(); + + let res = test::call_service(&app, req).await; + snapshot!(res.status(), @"400 Bad Request"); + + let bytes = test::read_body(res).await; + let response = serde_json::from_slice::(&bytes).expect("Expecting valid json"); + snapshot!(json_string!(response), @r###" + { + "message": "Received bad task id: 12 should be >= to 26.", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + "###); +} From 01ae46dd801a2fbe43351660acc65e3467747006 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 20 Feb 2024 11:24:44 +0100 Subject: [PATCH 04/32] add an experimental cli parameter to allow specifying your task id --- .../src/analytics/segment_analytics.rs | 3 +++ .../src/extractors/sequential_extractor.rs | 1 + meilisearch/src/lib.rs | 1 + meilisearch/src/option.rs | 17 +++++++++++++++++ meilisearch/src/routes/dump.rs | 4 +++- meilisearch/src/routes/indexes/documents.rs | 19 +++++++++++++------ meilisearch/src/routes/indexes/mod.rs | 10 +++++++--- meilisearch/src/routes/indexes/settings.rs | 14 ++++++++++---- meilisearch/src/routes/mod.rs | 6 +++++- meilisearch/src/routes/snapshot.rs | 4 +++- meilisearch/src/routes/swap_indexes.rs | 4 +++- meilisearch/src/routes/tasks.rs | 7 +++++-- meilisearch/tests/index/create_index.rs | 9 +++++++-- 13 files changed, 78 insertions(+), 21 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index a38ddaab2..a78b0d11b 100644 --- 
a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -250,6 +250,7 @@ impl super::Analytics for SegmentAnalytics { struct Infos { env: String, experimental_enable_metrics: bool, + experimental_ha_parameters: bool, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, @@ -288,6 +289,7 @@ impl From for Infos { let Opt { db_path, experimental_enable_metrics, + experimental_ha_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, @@ -335,6 +337,7 @@ impl From for Infos { Self { env, experimental_enable_metrics, + experimental_ha_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, db_path: db_path != PathBuf::from("./data.ms"), diff --git a/meilisearch/src/extractors/sequential_extractor.rs b/meilisearch/src/extractors/sequential_extractor.rs index c04210616..23d6cb997 100644 --- a/meilisearch/src/extractors/sequential_extractor.rs +++ b/meilisearch/src/extractors/sequential_extractor.rs @@ -131,6 +131,7 @@ gen_seq! { SeqFromRequestFut3; A B C } gen_seq! { SeqFromRequestFut4; A B C D } gen_seq! { SeqFromRequestFut5; A B C D E } gen_seq! { SeqFromRequestFut6; A B C D E F } +gen_seq! { SeqFromRequestFut7; A B C D E F G } pin_project! { #[project = ExtractProj] diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 328b9e9b2..2d9dec485 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -453,6 +453,7 @@ pub fn configure_data( .app_data(auth) .app_data(web::Data::from(analytics)) .app_data(web::Data::new(logs)) + .app_data(web::Data::new(opt.clone())) .app_data( web::JsonConfig::default() .limit(http_payload_size_limit) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 9586a3f6f..4dd17d546 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -51,6 +51,7 @@ const MEILI_IGNORE_MISSING_DUMP: &str = "MEILI_IGNORE_MISSING_DUMP"; const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; +const MEILI_EXPERIMENTAL_HA_PARAMETERS: &str = "MEILI_EXPERIMENTAL_HA_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = @@ -317,6 +318,17 @@ pub struct Opt { #[serde(default)] pub experimental_enable_logs_route: bool, + /// Enable multiple features that helps you to run meilisearch in a high availability context. 
+ /// TODO: TAMO: Update the discussion link + /// For more information, see: + /// + /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now + /// - Lets you specify a custom task ID upon registering a task + /// - Lets you execute dry-register a task (get an answer from the route but nothing is actually registered in meilisearch and it won't be processed) + #[clap(long, env = MEILI_EXPERIMENTAL_HA_PARAMETERS)] + #[serde(default)] + pub experimental_ha_parameters: bool, + /// Experimental RAM reduction during indexing, do not use in production, see: #[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)] #[serde(default)] @@ -423,6 +435,7 @@ impl Opt { no_analytics, experimental_enable_metrics, experimental_enable_logs_route, + experimental_ha_parameters, experimental_reduce_indexing_memory_usage, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); @@ -479,6 +492,10 @@ impl Opt { MEILI_EXPERIMENTAL_ENABLE_METRICS, experimental_enable_metrics.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_HA_PARAMETERS, + experimental_ha_parameters.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE, experimental_enable_logs_route.to_string(), diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 8f44070d8..56231a759 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -12,6 +12,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); @@ -21,6 +22,7 @@ pub async fn create_dump( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { analytics.publish("Dump Created".to_string(), json!({}), Some(&req)); @@ -29,7 +31,7 @@ pub async fn create_dump( keys: auth_controller.list_keys()?, instance_uid: analytics.instance_uid().cloned(), }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 492f039cf..5bf7eaa8d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -38,6 +38,7 @@ use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::search::parse_filter; +use crate::Opt; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()] @@ -119,6 +120,7 @@ pub async fn delete_document( index_scheduler: GuardedData, Data>, path: web::Path, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = path.into_inner(); @@ -130,7 +132,7 @@ pub async fn delete_document( index_uid: index_uid.to_string(), documents_ids: vec![document_id], }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move 
|| index_scheduler.register(task, uid)).await??.into(); debug!("returns: {:?}", task); @@ -268,6 +270,7 @@ pub async fn replace_documents( params: AwebQueryParameter, body: Payload, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -278,7 +281,7 @@ pub async fn replace_documents( analytics.add_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -302,6 +305,7 @@ pub async fn update_documents( params: AwebQueryParameter, body: Payload, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -312,7 +316,7 @@ pub async fn update_documents( analytics.update_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -472,6 +476,7 @@ pub async fn delete_documents_batch( index_uid: web::Path, body: web::Json>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by batch"); @@ -486,7 +491,7 @@ pub async fn delete_documents_batch( let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -506,6 +511,7 @@ pub async fn delete_documents_by_filter( index_uid: web::Path, body: AwebJson, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by filter"); @@ -523,7 +529,7 @@ pub async fn delete_documents_by_filter( .map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?; let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -535,13 +541,14 @@ pub async fn clear_all_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 6451d930d..59a1f0e64 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -22,6 +22,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::Opt; pub mod 
documents; pub mod facet_search; @@ -123,6 +124,7 @@ pub async fn create_index( index_scheduler: GuardedData, Data>, body: AwebJson, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Create index"); @@ -137,7 +139,7 @@ pub async fn create_index( ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create index"); @@ -191,6 +193,7 @@ pub async fn update_index( index_uid: web::Path, body: AwebJson, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Update index"); @@ -207,7 +210,7 @@ pub async fn update_index( primary_key: body.primary_key, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -219,10 +222,11 @@ pub async fn delete_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, req: HttpRequest, + opt: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete index"); diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 9fbd84161..6e43bce41 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -16,6 +16,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::Opt; #[macro_export] macro_rules! make_setting_route { @@ -34,6 +35,7 @@ macro_rules! make_setting_route { use $crate::extractors::authentication::policies::*; use $crate::extractors::authentication::GuardedData; use $crate::extractors::sequential_extractor::SeqHandler; + use $crate::Opt; use $crate::routes::{get_task_id, SummarizedTaskView}; pub async fn delete( @@ -43,6 +45,7 @@ macro_rules! make_setting_route { >, index_uid: web::Path, req: HttpRequest, + opt: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -57,7 +60,7 @@ macro_rules! make_setting_route { is_deletion: true, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? @@ -75,6 +78,7 @@ macro_rules! make_setting_route { index_uid: actix_web::web::Path, body: deserr::actix_web::AwebJson, $err_ty>, req: HttpRequest, + opt: web::Data, $analytics_var: web::Data, ) -> std::result::Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -107,7 +111,7 @@ macro_rules! make_setting_route { is_deletion: false, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? 
@@ -655,6 +659,7 @@ pub async fn update_all( index_uid: web::Path, body: AwebJson, DeserrJsonError>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -770,7 +775,7 @@ pub async fn update_all( is_deletion: false, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -795,6 +800,7 @@ pub async fn delete_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, req: HttpRequest, + opt: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -808,7 +814,7 @@ pub async fn delete_all( is_deletion: true, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 61a9f3352..2dc89b150 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -15,6 +15,7 @@ use tracing::debug; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; +use crate::Opt; const PAGINATION_DEFAULT_LIMIT: usize = 20; @@ -45,7 +46,10 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/experimental-features").configure(features::configure)); } -pub fn get_task_id(req: &HttpRequest) -> Result, ResponseError> { +pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, ResponseError> { + if !opt.experimental_ha_parameters { + return Ok(None); + } let task_id = req .headers() .get("TaskId") diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index 28dbac85f..6b3178126 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -11,6 +11,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); @@ -19,12 +20,13 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub async fn create_snapshot( index_scheduler: GuardedData, Data>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req)); let task = KindWithContent::SnapshotCreation; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 64268dbfa..f8adeeb18 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -16,6 +16,7 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(swap_indexes)))); @@ -32,6 +33,7 @@ pub 
async fn swap_indexes( index_scheduler: GuardedData, Data>, params: AwebJson, DeserrJsonError>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -60,7 +62,7 @@ pub async fn swap_indexes( } let task = KindWithContent::IndexSwap { swaps }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 26e1c43f8..279b57e3d 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -23,6 +23,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +use crate::Opt; const DEFAULT_LIMIT: u32 = 20; @@ -161,6 +162,7 @@ async fn cancel_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -197,7 +199,7 @@ async fn cancel_tasks( let task_cancelation = KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid)).await??; let task: SummarizedTaskView = task.into(); @@ -209,6 +211,7 @@ async fn delete_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -244,7 +247,7 @@ async fn delete_tasks( let task_deletion = KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid)).await??; let task: SummarizedTaskView = task.into(); diff --git a/meilisearch/tests/index/create_index.rs b/meilisearch/tests/index/create_index.rs index b9f755f35..7a678624c 100644 --- a/meilisearch/tests/index/create_index.rs +++ b/meilisearch/tests/index/create_index.rs @@ -2,9 +2,10 @@ use actix_web::http::header::ContentType; use actix_web::test; use http::header::ACCEPT_ENCODING; use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; use crate::common::encoder::Encoder; -use crate::common::{Server, Value}; +use crate::common::{default_settings, Server, Value}; use crate::json; #[actix_rt::test] @@ -202,7 +203,11 @@ async fn error_create_with_invalid_index_uid() { #[actix_rt::test] async fn send_task_id() { - let server = Server::new().await; + let temp = tempfile::tempdir().unwrap(); + + let options = Opt { experimental_ha_parameters: true, ..default_settings(temp.path()) }; + let server = Server::new_with_options(options).await.unwrap(); + let app = server.init_web_app().await; let index = server.index("catto"); let (response, code) = index.create(None).await; From 6ba999491693fe6ca94376a421817707fc8e66c3 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 20 Feb 2024 12:16:50 +0100 Subject: [PATCH 05/32] disable the auto deletion of tasks when the ha mode is enabled --- index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 68 +++++++++++++- .../task_deletion_have_not_been_enqueued.snap | 90 +++++++++++++++++++ 
.../task_queue_is_full.snap | 90 +++++++++++++++++++ meilisearch/src/lib.rs | 1 + 5 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 42f041578..988e75b81 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -15,6 +15,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { let IndexScheduler { autobatching_enabled, + cleanup_enabled: _, must_stop_processing: _, processing_tasks, file_store, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b1edaabe5..9a1799469 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -264,6 +264,9 @@ pub struct IndexSchedulerOptions { /// Set to `true` iff the index scheduler is allowed to automatically /// batch tasks together, to process multiple tasks at once. pub autobatching_enabled: bool, + /// Set to `true` iff the index scheduler is allowed to automatically + /// delete the finished tasks when there are too many tasks. + pub cleanup_enabled: bool, /// The maximum number of tasks stored in the task queue before starting /// to auto schedule task deletions. pub max_number_of_tasks: usize, @@ -324,6 +327,9 @@ pub struct IndexScheduler { /// Whether auto-batching is enabled or not. pub(crate) autobatching_enabled: bool, + /// Whether we should automatically cleanup the task queue or not. + pub(crate) cleanup_enabled: bool, + /// The max number of tasks allowed before the scheduler starts to delete /// the finished tasks automatically. 
pub(crate) max_number_of_tasks: usize, @@ -390,6 +396,7 @@ impl IndexScheduler { index_mapper: self.index_mapper.clone(), wake_up: self.wake_up.clone(), autobatching_enabled: self.autobatching_enabled, + cleanup_enabled: self.cleanup_enabled, max_number_of_tasks: self.max_number_of_tasks, max_number_of_batched_tasks: self.max_number_of_batched_tasks, puffin_frame: self.puffin_frame.clone(), @@ -491,6 +498,7 @@ impl IndexScheduler { wake_up: Arc::new(SignalEvent::auto(true)), puffin_frame: Arc::new(puffin::GlobalFrameView::default()), autobatching_enabled: options.autobatching_enabled, + cleanup_enabled: options.cleanup_enabled, max_number_of_tasks: options.max_number_of_tasks, max_number_of_batched_tasks: options.max_number_of_batched_tasks, dumps_path: options.dumps_path, @@ -1134,7 +1142,9 @@ impl IndexScheduler { self.breakpoint(Breakpoint::Start); } - self.cleanup_task_queue()?; + if self.cleanup_enabled { + self.cleanup_task_queue()?; + } let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; let batch = @@ -1781,6 +1791,7 @@ mod tests { index_count: 5, indexer_config, autobatching_enabled: true, + cleanup_enabled: true, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: usize::MAX, instance_features: Default::default(), @@ -4484,6 +4495,61 @@ mod tests { drop(rtxn); } + #[test] + fn test_disable_auto_deletion_of_tasks() { + let (index_scheduler, mut handle) = + IndexScheduler::test_with_custom_config(vec![], |config| { + config.cleanup_enabled = false; + config.max_number_of_tasks = 2; + }); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + handle.advance_one_failed_batch(); + + // at this point the max number of tasks is reached + // we can still enqueue multiple tasks + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); + let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]" }), name: "task_queue_is_full"); + drop(rtxn); + + // now we're above the max number of tasks + // and if we try to advance in the tick function no new task deletion should be enqueued + handle.advance_till([Start, BatchCreated]); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); + let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_not_been_enqueued"); + drop(rtxn); + } + #[test] fn basic_get_stats() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); diff --git a/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap 
b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap new file mode 100644 index 000000000..988df76ec --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap @@ -0,0 +1,90 @@ +--- +source: index-scheduler/src/lib.rs +--- +[ + { + "uid": 0, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "succeeded", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 1, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": { + "message": "Index `doggo` already exists.", + "code": "index_already_exists", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_already_exists" + }, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "failed", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 2, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 3, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + } +] diff --git a/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap new file mode 100644 index 000000000..988df76ec --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap @@ -0,0 +1,90 @@ +--- +source: index-scheduler/src/lib.rs +--- +[ + { + "uid": 0, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "succeeded", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 1, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": { + "message": "Index `doggo` already exists.", + "code": "index_already_exists", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_already_exists" + }, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "failed", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 2, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 3, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": 
"enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + } +] diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 2d9dec485..500d56079 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -286,6 +286,7 @@ fn open_or_create_database_unchecked( enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, indexer_config: (&opt.indexer_options).try_into()?, autobatching_enabled: true, + cleanup_enabled: !opt.experimental_ha_parameters, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize, From 05ae29198970d9265b26f6f7232e8611708ca2d6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 21 Feb 2024 11:21:26 +0100 Subject: [PATCH 06/32] implement the dry run ha parameter --- file-store/src/lib.rs | 22 +- index-scheduler/src/lib.rs | 249 ++++++++++++++++---- meilisearch/src/lib.rs | 4 +- meilisearch/src/routes/dump.rs | 7 +- meilisearch/src/routes/indexes/documents.rs | 35 ++- meilisearch/src/routes/indexes/mod.rs | 16 +- meilisearch/src/routes/indexes/settings.rs | 20 +- meilisearch/src/routes/mod.rs | 19 ++ meilisearch/src/routes/snapshot.rs | 7 +- meilisearch/src/routes/swap_indexes.rs | 7 +- meilisearch/src/routes/tasks.rs | 10 +- 11 files changed, 317 insertions(+), 79 deletions(-) diff --git a/file-store/src/lib.rs b/file-store/src/lib.rs index 75db9bb5f..e3851a2df 100644 --- a/file-store/src/lib.rs +++ b/file-store/src/lib.rs @@ -56,7 +56,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::new_v4(); let path = self.path.join(uuid.to_string()); - let update_file = File { file, path }; + let update_file = File { dry: false, file, path }; Ok((uuid, update_file)) } @@ -67,7 +67,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::from_u128(uuid); let path = self.path.join(uuid.to_string()); - let update_file = File { file, path }; + let update_file = File { dry: false, file, path }; Ok((uuid, update_file)) } @@ -135,13 +135,29 @@ impl FileStore { } pub struct File { + dry: bool, path: PathBuf, file: NamedTempFile, } impl File { + pub fn dry_file() -> Result { + #[cfg(target_family = "unix")] + let path = PathBuf::from_str("/dev/null").unwrap(); + #[cfg(target_family = "windows")] + let path = PathBuf::from_str("\\Device\\Null").unwrap(); + + Ok(Self { + dry: true, + path: path.clone(), + file: tempfile::Builder::new().make(|_| std::fs::File::create(path.clone()))?, + }) + } + pub fn persist(self) -> Result<()> { - self.file.persist(&self.path)?; + if !self.dry { + self.file.persist(&self.path)?; + } Ok(()) } } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 9a1799469..5d0ce9eb9 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1001,7 +1001,12 @@ impl IndexScheduler { /// Register a new task in the scheduler. /// /// If it fails and data was associated with the task, it tries to delete the associated data. - pub fn register(&self, kind: KindWithContent, task_id: Option) -> Result { + pub fn register( + &self, + kind: KindWithContent, + task_id: Option, + dry_run: bool, + ) -> Result { let mut wtxn = self.env.write_txn()?; // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task @@ -1037,6 +1042,11 @@ impl IndexScheduler { // (that it does not contain duplicate indexes). 
check_index_swap_validity(&task)?; + // At this point the task is going to be registered and no further checks will be done + if dry_run { + return Ok(task); + } + // Get rid of the mutability. let task = task; @@ -1101,8 +1111,12 @@ impl IndexScheduler { /// The returned file and uuid can be used to associate /// some data to a task. The file will be kept until /// the task has been fully processed. - pub fn create_update_file(&self) -> Result<(Uuid, file_store::File)> { - Ok(self.file_store.new_update()?) + pub fn create_update_file(&self, dry_run: bool) -> Result<(Uuid, file_store::File)> { + if dry_run { + Ok((Uuid::nil(), file_store::File::dry_file()?)) + } else { + Ok(self.file_store.new_update()?) + } } #[cfg(test)] @@ -1413,6 +1427,7 @@ impl IndexScheduler { tasks: to_delete, }, None, + false, )?; Ok(()) @@ -1534,7 +1549,7 @@ impl<'a> Dump<'a> { ) -> Result { let content_uuid = match content_file { Some(content_file) if task.status == Status::Enqueued => { - let (uuid, mut file) = self.index_scheduler.create_update_file()?; + let (uuid, mut file) = self.index_scheduler.create_update_file(false)?; let mut builder = DocumentsBatchBuilder::new(file.as_file_mut()); for doc in content_file { builder.append_json_object(&doc?)?; @@ -2038,7 +2053,7 @@ mod tests { for (idx, kind) in kinds.into_iter().enumerate() { let k = kind.as_kind(); - let task = index_scheduler.register(kind, None).unwrap(); + let task = index_scheduler.register(kind, None, false).unwrap(); index_scheduler.assert_internally_consistent(); assert_eq!(task.uid, idx as u32); @@ -2053,18 +2068,18 @@ mod tests { fn insert_task_while_another_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated]); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation"); // while the task is processing can we register another task? 
- index_scheduler.register(index_creation_task("index_b", "id"), None).unwrap(); + index_scheduler.register(index_creation_task("index_b", "id"), None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); } @@ -2073,7 +2088,7 @@ mod tests { fn test_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task"); handle.advance_till([Start, BatchCreated]); @@ -2090,6 +2105,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2098,12 +2114,13 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2125,22 +2142,23 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_fourth_task"); @@ -2173,7 +2191,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2188,6 +2206,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0, 1]), }, None, + false, ) .unwrap(); // again, no progress made at all, but one more task is registered @@ -2222,7 +2241,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2239,6 
+2258,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_task_deletion"); @@ -2262,7 +2282,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2280,6 +2300,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2313,6 +2334,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); @@ -2338,6 +2360,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2356,12 +2379,13 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2395,6 +2419,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2405,6 +2430,7 @@ mod tests { documents_ids: vec![S("1"), S("2")], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2434,6 +2460,7 @@ mod tests { documents_ids: vec![S("1"), S("2")], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2458,6 +2485,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2495,6 +2523,7 @@ mod tests { primary_key: None, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2502,7 +2531,11 @@ mod tests { for name in index_names { index_scheduler - .register(KindWithContent::DocumentClear { index_uid: name.to_string() }, None) + .register( + KindWithContent::DocumentClear { index_uid: name.to_string() }, + None, + false, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2527,7 +2560,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2549,6 +2582,7 @@ mod tests { ], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_registered"); @@ -2558,6 +2592,7 @@ mod tests { swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "two_swaps_registered"); @@ -2568,7 +2603,9 @@ mod tests { handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_swap_processed"); - 
index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }, None).unwrap(); + index_scheduler + .register(KindWithContent::IndexSwap { swaps: vec![] }, None, false) + .unwrap(); handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_empty_swap_processed"); } @@ -2585,7 +2622,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_n_successful_batches(4); @@ -2603,6 +2640,7 @@ mod tests { ], }, None, + false, ) .unwrap_err(); snapshot!(format!("{err}"), @"Indexes must be declared only once during a swap. `a`, `b` were specified several times."); @@ -2621,6 +2659,7 @@ mod tests { ], }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -2652,10 +2691,11 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler)); @@ -2680,7 +2720,7 @@ mod tests { }, ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2697,7 +2737,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0), None) + .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2711,6 +2751,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); @@ -2726,7 +2767,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0), None) + .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2740,6 +2781,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); @@ -2770,7 +2812,7 @@ mod tests { replace_document_import_task("wolfo", None, 2, documents_count2), ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_one_successful_batch(); @@ -2784,6 +2826,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0, 1, 2]), }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processing_second_task_cancel_enqueued"); @@ -2822,6 +2865,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2872,6 +2916,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2924,6 +2969,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2977,6 +3023,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3031,6 +3078,7 @@ mod tests { 
allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3076,13 +3124,13 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("doggo", "bone"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); let kind = index_creation_task("whalo", "plankton"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); let kind = index_creation_task("catto", "his_own_vomit"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); handle.advance_n_successful_batches(3); @@ -3140,11 +3188,11 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3363,17 +3411,17 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], }; - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3449,20 +3497,20 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _ = index_scheduler.register(kind, None).unwrap(); + let _ = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _ = index_scheduler.register(kind, None).unwrap(); + let _ = index_scheduler.register(kind, None, false).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); 
handle.advance_n_successful_batches(1); let kind = KindWithContent::TaskCancelation { query: "test_query".to_string(), tasks: [0, 1, 2, 3].into_iter().collect(), }; - let task_cancelation = index_scheduler.register(kind, None).unwrap(); + let task_cancelation = index_scheduler.register(kind, None, false).unwrap(); handle.advance_n_successful_batches(1); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3497,7 +3545,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); handle.advance_one_failed_batch(); @@ -3532,6 +3580,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3573,6 +3622,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3632,6 +3682,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3683,6 +3734,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3714,6 +3766,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3743,6 +3796,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3779,6 +3833,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3808,6 +3863,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3848,6 +3904,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3878,6 +3935,7 @@ mod tests { allow_index_creation, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3934,6 +3992,7 @@ mod tests { allow_index_creation, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3989,6 +4048,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4053,6 +4113,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4113,6 +4174,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4197,6 +4259,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4283,6 +4346,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4337,7 +4401,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = 
index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated, ProcessBatchFailed, AfterProcessing]); @@ -4360,6 +4424,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4368,6 +4433,7 @@ mod tests { let result = index_scheduler.register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ); if result.is_err() { break; @@ -4381,6 +4447,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); @@ -4392,6 +4459,7 @@ mod tests { .register( KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, None, + false, ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); @@ -4403,6 +4471,7 @@ mod tests { .register( KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4412,6 +4481,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -4428,6 +4498,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4436,6 +4507,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -4446,12 +4518,14 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); index_scheduler .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); @@ -4507,6 +4581,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4515,6 +4590,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -4525,12 +4601,14 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); index_scheduler .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); @@ -4555,11 +4633,11 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, 
None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" { @@ -4709,11 +4787,11 @@ mod tests { query: "cancel dump".to_owned(), tasks: RoaringBitmap::from_iter([0]), }; - let _ = index_scheduler.register(dump_creation, None).unwrap(); + let _ = index_scheduler.register(dump_creation, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - let _ = index_scheduler.register(dump_cancellation, None).unwrap(); + let _ = index_scheduler.register(dump_cancellation, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); @@ -4727,15 +4805,86 @@ mod tests { let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, None).unwrap(); + let task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(task.uid, @"0"); let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, Some(12)).unwrap(); + let task = index_scheduler.register(kind, Some(12), false).unwrap(); snapshot!(task.uid, @"12"); let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let error = index_scheduler.register(kind, Some(5)).unwrap_err(); + let error = index_scheduler.register(kind, Some(5), false).unwrap_err(); snapshot!(error, @"Received bad task id: 5 should be >= to 13."); } + + #[test] + fn dry_run() { + let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, None, true).unwrap(); + snapshot!(task.uid, @"0"); + snapshot!(snapshot_index_scheduler(&index_scheduler), @r###" + ### Autobatching Enabled = true + ### Processing Tasks: + [] + ---------------------------------------------------------------------- + ### All Tasks: + ---------------------------------------------------------------------- + ### Status: + ---------------------------------------------------------------------- + ### Kind: + ---------------------------------------------------------------------- + ### Index Tasks: + ---------------------------------------------------------------------- + ### Index Mapper: + + ---------------------------------------------------------------------- + ### Canceled By: + + ---------------------------------------------------------------------- + ### Enqueued At: + ---------------------------------------------------------------------- + ### Started At: + ---------------------------------------------------------------------- + ### Finished At: + ---------------------------------------------------------------------- + ### File Store: + + ---------------------------------------------------------------------- + "###); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, Some(12), true).unwrap(); + snapshot!(task.uid, @"12"); + snapshot!(snapshot_index_scheduler(&index_scheduler), @r###" + ### Autobatching Enabled = true + ### Processing Tasks: + [] + ---------------------------------------------------------------------- + ### All 
Tasks: + ---------------------------------------------------------------------- + ### Status: + ---------------------------------------------------------------------- + ### Kind: + ---------------------------------------------------------------------- + ### Index Tasks: + ---------------------------------------------------------------------- + ### Index Mapper: + + ---------------------------------------------------------------------- + ### Canceled By: + + ---------------------------------------------------------------------- + ### Enqueued At: + ---------------------------------------------------------------------- + ### Started At: + ---------------------------------------------------------------------- + ### Finished At: + ---------------------------------------------------------------------- + ### File Store: + + ---------------------------------------------------------------------- + "###); + } } diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 500d56079..de26b771e 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -251,7 +251,9 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< .name(String::from("register-snapshot-tasks")) .spawn(move || loop { thread::sleep(snapshot_delay); - if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation, None) { + if let Err(e) = + index_scheduler.register(KindWithContent::SnapshotCreation, None, false) + { error!("Error while registering snapshot: {}", e); } }) diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 56231a759..7f3cd06a5 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -11,7 +11,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -32,8 +32,11 @@ pub async fn create_dump( instance_uid: analytics.instance_uid().cloned(), }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); debug!(returns = ?task, "Create dump"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 5bf7eaa8d..a74bbff49 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -36,7 +36,9 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{get_task_id, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use crate::routes::{ + get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, +}; use crate::search::parse_filter; use crate::Opt; @@ -133,8 +135,11 @@ pub async fn delete_document( documents_ids: vec![document_id], }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } @@ -282,6 +287,7 @@ pub async fn replace_documents( let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -291,6 +297,7 @@ pub async fn replace_documents( body, IndexDocumentsMethod::ReplaceDocuments, uid, + dry_run, allow_index_creation, ) .await?; @@ -317,6 +324,7 @@ pub async fn update_documents( let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -326,6 +334,7 @@ pub async fn update_documents( body, IndexDocumentsMethod::UpdateDocuments, uid, + dry_run, allow_index_creation, ) .await?; @@ -344,6 +353,7 @@ async fn document_addition( mut body: Payload, method: IndexDocumentsMethod, task_id: Option, + dry_run: bool, allow_index_creation: bool, ) -> Result { let format = match ( @@ -376,7 +386,7 @@ async fn document_addition( } }; - let (uuid, mut update_file) = index_scheduler.create_update_file()?; + let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; let temp_file = match tempfile() { Ok(file) => file, @@ -460,7 +470,9 @@ async fn document_addition( }; let scheduler = index_scheduler.clone(); - let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id)).await? { + let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id, dry_run)) + .await? + { Ok(task) => task, Err(e) => { index_scheduler.delete_update_file(uuid)?; @@ -492,8 +504,11 @@ pub async fn delete_documents_batch( let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); debug!(returns = ?task, "Delete documents by batch"); Ok(HttpResponse::Accepted().json(task)) @@ -530,8 +545,11 @@ pub async fn delete_documents_by_filter( let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Delete documents by filter"); Ok(HttpResponse::Accepted().json(task)) @@ -549,8 +567,11 @@ pub async fn clear_all_documents( let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Delete all documents"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 59a1f0e64..59fa02dff 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -22,6 +22,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::is_dry_run; use crate::Opt; pub mod documents; @@ -140,8 +141,11 @@ pub async fn create_index( let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Create index"); Ok(HttpResponse::Accepted().json(task)) @@ -211,8 +215,11 @@ pub async fn update_index( }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Update index"); Ok(HttpResponse::Accepted().json(task)) @@ -227,8 +234,11 @@ pub async fn delete_index( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); debug!(returns = ?task, "Delete index"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 6e43bce41..c71d83279 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -15,7 +15,7 @@ use tracing::debug; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; -use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; #[macro_export] @@ -36,7 +36,7 @@ macro_rules! make_setting_route { use $crate::extractors::authentication::GuardedData; use $crate::extractors::sequential_extractor::SeqHandler; use $crate::Opt; - use $crate::routes::{get_task_id, SummarizedTaskView}; + use $crate::routes::{is_dry_run, get_task_id, SummarizedTaskView}; pub async fn delete( index_scheduler: GuardedData< @@ -61,8 +61,9 @@ macro_rules! make_setting_route { allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) .await?? .into(); @@ -112,8 +113,9 @@ macro_rules! make_setting_route { allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) .await?? .into(); @@ -776,8 +778,11 @@ pub async fn update_all( allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Update all settings"); Ok(HttpResponse::Accepted().json(task)) @@ -815,8 +820,11 @@ pub async fn delete_all( allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Delete all settings"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 2dc89b150..f98d4b4de 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -77,6 +77,25 @@ pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, Respo Ok(task_id) } +pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result { + if !opt.experimental_ha_parameters { + return Ok(false); + } + Ok(req + .headers() + .get("DryRun") + .map(|header| { + header.to_str().map_err(|e| { + ResponseError::from_msg( + format!("DryRun is not a valid utf-8 string: {e}"), + Code::BadRequest, + ) + }) + }) + .transpose()? 
+ .map_or(false, |s| s.to_lowercase() == "true")) +} + #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct SummarizedTaskView { diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index 6b3178126..84673729f 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -10,7 +10,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -27,8 +27,11 @@ pub async fn create_snapshot( let task = KindWithContent::SnapshotCreation; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Create snapshot"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index f8adeeb18..51a7b0707 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -10,7 +10,7 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use serde_json::json; -use super::{get_task_id, SummarizedTaskView}; +use super::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; @@ -63,7 +63,10 @@ pub async fn swap_indexes( let task = KindWithContent::IndexSwap { swaps }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); Ok(HttpResponse::Accepted().json(task)) } diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 279b57e3d..f35d97fe6 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -18,7 +18,7 @@ use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; use tokio::task; -use super::{get_task_id, SummarizedTaskView}; +use super::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; @@ -200,8 +200,10 @@ async fn cancel_tasks( KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task = - task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid)).await??; + task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid, dry_run)) + .await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) @@ -248,7 +250,9 @@ async fn delete_tasks( KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; let uid = get_task_id(&req, &opt)?; - let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid)).await??; + let dry_run = is_dry_run(&req, &opt)?; + let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid, dry_run)) + .await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) From e1a3eed1eb90a660535b67fa11bf8843e309198a Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 21 Feb 2024 12:30:28 +0100 Subject: [PATCH 07/32] update the discussion link --- meilisearch/src/option.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 4dd17d546..f932abac6 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -319,8 +319,7 @@ pub struct Opt { pub experimental_enable_logs_route: bool, /// Enable multiple features that helps you to run meilisearch in a high availability context. 
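Among the features this flag unlocks is the dry-run registration driven by the `DryRun` header, parsed by the `is_dry_run` helper shown earlier: the header only counts when the experimental flag is on, and only a case-insensitive "true" enables it. A self-contained sketch of that decision table, with plain byte slices standing in for actix-web's header types and a hypothetical `HeaderError` standing in for the real `ResponseError`:

    // A sketch of the DryRun header decision. Plain byte slices stand in for
    // actix-web's HeaderValue, and `HeaderError` for the real ResponseError.
    #[derive(Debug, PartialEq)]
    enum HeaderError {
        NotUtf8,
    }

    fn is_dry_run(replication_enabled: bool, header: Option<&[u8]>) -> Result<bool, HeaderError> {
        // Without the experimental flag the header is ignored entirely.
        if !replication_enabled {
            return Ok(false);
        }
        let Some(bytes) = header else {
            return Ok(false);
        };
        let value = std::str::from_utf8(bytes).map_err(|_| HeaderError::NotUtf8)?;
        // Anything other than a case-insensitive "true" means a normal run.
        Ok(value.eq_ignore_ascii_case("true"))
    }

    fn main() {
        assert!(!is_dry_run(false, Some(b"true")).unwrap()); // flag off: never a dry run
        assert!(is_dry_run(true, Some(b"TRUE")).unwrap());
        assert!(!is_dry_run(true, Some(b"1")).unwrap());
        assert!(!is_dry_run(true, None).unwrap());
        assert_eq!(is_dry_run(true, Some(&[0xff])), Err(HeaderError::NotUtf8));
    }

The route helper reaches the same result by lowercasing the header value and comparing against "true"; `eq_ignore_ascii_case` is simply a variant of that test that avoids the allocation.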
- /// TODO: TAMO: Update the discussion link - /// For more information, see: + /// For more information, see: /// /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now /// - Lets you specify a custom task ID upon registering a task From 693ba8dd15280fe4be1f06fbc27465ff6d7fa551 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 21 Feb 2024 14:33:40 +0100 Subject: [PATCH 08/32] rename the cli parameter --- meilisearch/src/analytics/segment_analytics.rs | 6 +++--- meilisearch/src/lib.rs | 2 +- meilisearch/src/option.rs | 14 +++++++------- meilisearch/src/routes/mod.rs | 4 ++-- meilisearch/tests/index/create_index.rs | 3 ++- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index a78b0d11b..8bb7e8d81 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -250,7 +250,7 @@ impl super::Analytics for SegmentAnalytics { struct Infos { env: String, experimental_enable_metrics: bool, - experimental_ha_parameters: bool, + experimental_replication_parameters: bool, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, @@ -289,7 +289,7 @@ impl From for Infos { let Opt { db_path, experimental_enable_metrics, - experimental_ha_parameters, + experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, @@ -337,7 +337,7 @@ impl From for Infos { Self { env, experimental_enable_metrics, - experimental_ha_parameters, + experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, db_path: db_path != PathBuf::from("./data.ms"), diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index de26b771e..1ab161564 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -288,7 +288,7 @@ fn open_or_create_database_unchecked( enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, indexer_config: (&opt.indexer_options).try_into()?, autobatching_enabled: true, - cleanup_enabled: !opt.experimental_ha_parameters, + cleanup_enabled: !opt.experimental_replication_parameters, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index f932abac6..e6ff4f2a1 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -51,7 +51,7 @@ const MEILI_IGNORE_MISSING_DUMP: &str = "MEILI_IGNORE_MISSING_DUMP"; const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; -const MEILI_EXPERIMENTAL_HA_PARAMETERS: &str = "MEILI_EXPERIMENTAL_HA_PARAMETERS"; +const MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = @@ -318,15 +318,15 @@ pub struct Opt { #[serde(default)] pub experimental_enable_logs_route: bool, - /// Enable multiple features that helps you to run meilisearch in a high 
availability context.
+    /// Enable multiple features that help you to run meilisearch in a replicated context.
     /// For more information, see:
     ///
     /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now
     /// - Lets you specify a custom task ID upon registering a task
     /// - Lets you execute dry-register a task (get an answer from the route but nothing is actually registered in meilisearch and it won't be processed)
-    #[clap(long, env = MEILI_EXPERIMENTAL_HA_PARAMETERS)]
+    #[clap(long, env = MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS)]
     #[serde(default)]
-    pub experimental_ha_parameters: bool,
+    pub experimental_replication_parameters: bool,
 
     /// Experimental RAM reduction during indexing, do not use in production, see:
     #[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)]
@@ -434,7 +434,7 @@ impl Opt {
             no_analytics,
             experimental_enable_metrics,
             experimental_enable_logs_route,
-            experimental_ha_parameters,
+            experimental_replication_parameters,
             experimental_reduce_indexing_memory_usage,
         } = self;
         export_to_env_if_not_present(MEILI_DB_PATH, db_path);
@@ -492,8 +492,8 @@ impl Opt {
             experimental_enable_metrics.to_string(),
         );
         export_to_env_if_not_present(
-            MEILI_EXPERIMENTAL_HA_PARAMETERS,
-            experimental_ha_parameters.to_string(),
+            MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS,
+            experimental_replication_parameters.to_string(),
         );
         export_to_env_if_not_present(
             MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE,
diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs
index f98d4b4de..249103e12 100644
--- a/meilisearch/src/routes/mod.rs
+++ b/meilisearch/src/routes/mod.rs
@@ -47,7 +47,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
 }
 
 pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result<Option<TaskId>, ResponseError> {
-    if !opt.experimental_ha_parameters {
+    if !opt.experimental_replication_parameters {
         return Ok(None);
     }
     let task_id = req
@@ -78,7 +78,7 @@ pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result<Option<TaskId>, ResponseError> {
 }
 
 pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result<bool, ResponseError> {
-    if !opt.experimental_ha_parameters {
+    if !opt.experimental_replication_parameters {
         return Ok(false);
     }
     Ok(req
diff --git a/meilisearch/tests/index/create_index.rs b/meilisearch/tests/index/create_index.rs
index 7a678624c..b309b83c6 100644
--- a/meilisearch/tests/index/create_index.rs
+++ b/meilisearch/tests/index/create_index.rs
@@ -205,7 +205,8 @@ async fn error_create_with_invalid_index_uid() {
 async fn send_task_id() {
     let temp = tempfile::tempdir().unwrap();
 
-    let options = Opt { experimental_ha_parameters: true, ..default_settings(temp.path()) };
+    let options =
+        Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) };
     let server = Server::new_with_options(options).await.unwrap();
     let app = server.init_web_app().await;
 
From c2e2003a808f3526ccdb52d8d5033c8e9fc310aa Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 22 Feb 2024 15:51:47 +0100
Subject: [PATCH 09/32] create a test with the dry-run parameter enabled

---
 meilisearch/tests/common/index.rs            |  9 +---
 meilisearch/tests/documents/add_documents.rs | 49 +++++++++++++++++++-
 meilisearch/tests/documents/errors.rs        | 41 ++++++++++------
 3 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs
index 4992eeb13..16fc10e98 100644
--- a/meilisearch/tests/common/index.rs
+++ b/meilisearch/tests/common/index.rs
@@ -100,16 +100,11 @@ impl Index<'_> {
     pub async fn raw_add_documents(
         &self,
         payload: &str,
-        content_type:
Option<&str>, + headers: Vec<(&str, &str)>, query_parameter: &str, ) -> (Value, StatusCode) { let url = format!("/indexes/{}/documents{}", urlencode(self.uid.as_ref()), query_parameter); - - if let Some(content_type) = content_type { - self.service.post_str(url, payload, vec![("Content-Type", content_type)]).await - } else { - self.service.post_str(url, payload, Vec::new()).await - } + self.service.post_str(url, payload, headers).await } pub async fn update_documents( diff --git a/meilisearch/tests/documents/add_documents.rs b/meilisearch/tests/documents/add_documents.rs index 9733f7741..e6af85229 100644 --- a/meilisearch/tests/documents/add_documents.rs +++ b/meilisearch/tests/documents/add_documents.rs @@ -1,10 +1,11 @@ use actix_web::test; use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; use crate::common::encoder::Encoder; -use crate::common::{GetAllDocumentsOptions, Server, Value}; +use crate::common::{default_settings, GetAllDocumentsOptions, Server, Value}; use crate::json; /// This is the basic usage of our API and every other tests uses the content-type application/json @@ -2157,3 +2158,49 @@ async fn batch_several_documents_addition() { assert_eq!(code, 200, "failed with `{}`", response); assert_eq!(response["results"].as_array().unwrap().len(), 120); } + +#[actix_rt::test] +async fn dry_register_file() { + let temp = tempfile::tempdir().unwrap(); + + let options = + Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) }; + let server = Server::new_with_options(options).await.unwrap(); + let index = server.index("tamo"); + + let documents = r#" + { + "id": "12", + "doggo": "kefir" + } + "#; + + let (response, code) = index + .raw_add_documents( + documents, + vec![("Content-Type", "application/json"), ("DryRun", "true")], + "", + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "tamo", + "status": "enqueued", + "type": "documentAdditionOrUpdate", + "enqueuedAt": "[date]" + } + "###); + snapshot!(code, @"202 Accepted"); + + let (response, code) = index.get_task(response.uid()).await; + snapshot!(response, @r###" + { + "message": "Task `0` not found.", + "code": "task_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#task_not_found" + } + "###); + snapshot!(code, @"404 Not Found"); +} diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index bd06aabce..cd2d89813 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -209,7 +209,8 @@ async fn replace_documents_missing_payload() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = index.raw_add_documents("", Some("application/json"), "").await; + let (response, code) = + index.raw_add_documents("", vec![("Content-Type", "application/json")], "").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -220,7 +221,8 @@ async fn replace_documents_missing_payload() { } "###); - let (response, code) = index.raw_add_documents("", Some("application/x-ndjson"), "").await; + let (response, code) = + index.raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -231,7 +233,8 @@ async fn replace_documents_missing_payload() { } "###); - let (response, code) = index.raw_add_documents("", Some("text/csv"), 
"").await; + let (response, code) = + index.raw_add_documents("", vec![("Content-Type", "text/csv")], "").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -287,7 +290,7 @@ async fn replace_documents_missing_content_type() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = index.raw_add_documents("", None, "").await; + let (response, code) = index.raw_add_documents("", Vec::new(), "").await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -299,7 +302,7 @@ async fn replace_documents_missing_content_type() { "###); // even with a csv delimiter specified this error is triggered first - let (response, code) = index.raw_add_documents("", None, "?csvDelimiter=;").await; + let (response, code) = index.raw_add_documents("", Vec::new(), "?csvDelimiter=;").await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -345,7 +348,7 @@ async fn replace_documents_bad_content_type() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = index.raw_add_documents("", Some("doggo"), "").await; + let (response, code) = index.raw_add_documents("", vec![("Content-Type", "doggo")], "").await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -379,8 +382,9 @@ async fn replace_documents_bad_csv_delimiter() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = - index.raw_add_documents("", Some("application/json"), "?csvDelimiter").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter") + .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -391,8 +395,9 @@ async fn replace_documents_bad_csv_delimiter() { } "###); - let (response, code) = - index.raw_add_documents("", Some("application/json"), "?csvDelimiter=doggo").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=doggo") + .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -404,7 +409,11 @@ async fn replace_documents_bad_csv_delimiter() { "###); let (response, code) = index - .raw_add_documents("", Some("application/json"), &format!("?csvDelimiter={}", encode("🍰"))) + .raw_add_documents( + "", + vec![("Content-Type", "application/json")], + &format!("?csvDelimiter={}", encode("🍰")), + ) .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" @@ -469,8 +478,9 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = - index.raw_add_documents("", Some("application/json"), "?csvDelimiter=a").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=a") + .await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -481,8 +491,9 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() { } "###); - let (response, code) = - index.raw_add_documents("", Some("application/x-ndjson"), "?csvDelimiter=a").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "?csvDelimiter=a") + .await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), 
@r###" { From eb90f0b4fbf2ae1da9d9461f4480b764f59745bd Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 22 Feb 2024 18:42:12 +0100 Subject: [PATCH 10/32] fix and remove the file-store hack of /dev/null --- file-store/src/lib.rs | 56 +++++++++------------ index-scheduler/src/lib.rs | 54 ++++++++++---------- meilisearch-types/src/document_formats.rs | 16 +++--- meilisearch/src/routes/indexes/documents.rs | 8 ++- 4 files changed, 63 insertions(+), 71 deletions(-) diff --git a/file-store/src/lib.rs b/file-store/src/lib.rs index e3851a2df..15c4168bc 100644 --- a/file-store/src/lib.rs +++ b/file-store/src/lib.rs @@ -1,5 +1,5 @@ use std::fs::File as StdFile; -use std::ops::{Deref, DerefMut}; +use std::io::Write; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -22,20 +22,6 @@ pub enum Error { pub type Result = std::result::Result; -impl Deref for File { - type Target = NamedTempFile; - - fn deref(&self) -> &Self::Target { - &self.file - } -} - -impl DerefMut for File { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.file - } -} - #[derive(Clone, Debug)] pub struct FileStore { path: PathBuf, @@ -56,7 +42,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::new_v4(); let path = self.path.join(uuid.to_string()); - let update_file = File { dry: false, file, path }; + let update_file = File { file: Some(file), path }; Ok((uuid, update_file)) } @@ -67,7 +53,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::from_u128(uuid); let path = self.path.join(uuid.to_string()); - let update_file = File { dry: false, file, path }; + let update_file = File { file: Some(file), path }; Ok((uuid, update_file)) } @@ -135,33 +121,41 @@ impl FileStore { } pub struct File { - dry: bool, path: PathBuf, - file: NamedTempFile, + file: Option, } impl File { pub fn dry_file() -> Result { - #[cfg(target_family = "unix")] - let path = PathBuf::from_str("/dev/null").unwrap(); - #[cfg(target_family = "windows")] - let path = PathBuf::from_str("\\Device\\Null").unwrap(); - - Ok(Self { - dry: true, - path: path.clone(), - file: tempfile::Builder::new().make(|_| std::fs::File::create(path.clone()))?, - }) + Ok(Self { path: PathBuf::new(), file: None }) } pub fn persist(self) -> Result<()> { - if !self.dry { - self.file.persist(&self.path)?; + if let Some(file) = self.file { + file.persist(&self.path)?; } Ok(()) } } +impl Write for File { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if let Some(file) = self.file.as_mut() { + file.write(buf) + } else { + Ok(buf.len()) + } + } + + fn flush(&mut self) -> std::io::Result<()> { + if let Some(file) = self.file.as_mut() { + file.flush() + } else { + Ok(()) + } + } +} + #[cfg(test)] mod test { use std::io::Write; diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5d0ce9eb9..1c3b93bce 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1550,7 +1550,7 @@ impl<'a> Dump<'a> { let content_uuid = match content_file { Some(content_file) if task.status == Status::Enqueued => { let (uuid, mut file) = self.index_scheduler.create_update_file(false)?; - let mut builder = DocumentsBatchBuilder::new(file.as_file_mut()); + let mut builder = DocumentsBatchBuilder::new(&mut file); for doc in content_file { builder.append_json_object(&doc?)?; } @@ -1734,7 +1734,7 @@ pub struct IndexStats { #[cfg(test)] mod tests { - use std::io::{BufWriter, Seek, Write}; + use std::io::{BufWriter, Write}; use std::time::Instant; use big_s::S; @@ -1882,7 +1882,7 @@ mod tests { /// 
Adapting to the new json reading interface pub fn read_json( bytes: &[u8], - write: impl Write + Seek, + write: impl Write, ) -> std::result::Result { let temp_file = NamedTempFile::new().unwrap(); let mut buffer = BufWriter::new(temp_file.reopen().unwrap()); @@ -1909,7 +1909,7 @@ mod tests { ); let (_uuid, mut file) = index_scheduler.create_update_file_with_uuid(file_uuid).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); (file, documents_count) } @@ -2321,7 +2321,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2366,7 +2366,7 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2406,7 +2406,7 @@ mod tests { ]"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2472,7 +2472,7 @@ mod tests { ]"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2678,7 +2678,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2852,7 +2852,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2903,7 +2903,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2956,7 +2956,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3010,7 +3010,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), 
&mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3065,7 +3065,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3567,7 +3567,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3609,7 +3609,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3669,7 +3669,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3721,7 +3721,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3783,7 +3783,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3850,7 +3850,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3922,7 +3922,7 @@ mod tests { let allow_index_creation = i % 2 != 0; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3979,7 +3979,7 @@ mod tests { let allow_index_creation = i % 2 != 0; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -4033,7 +4033,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4098,7 +4098,7 @@ mod tests { ); let (uuid, mut file) = 
index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4159,7 +4159,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4244,7 +4244,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4331,7 +4331,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index 0f1d995f9..50dc5bad4 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -1,6 +1,6 @@ use std::fmt::{self, Debug, Display}; use std::fs::File; -use std::io::{self, Seek, Write}; +use std::io::{self, BufWriter, Write}; use std::marker::PhantomData; use memmap2::MmapOptions; @@ -104,8 +104,8 @@ impl ErrorCode for DocumentFormatError { } /// Reads CSV from input and write an obkv batch to writer. -pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); +pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result { + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); let mmap = unsafe { MmapOptions::new().map(file)? }; let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref()); builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?; @@ -116,9 +116,9 @@ pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result Ok(count as u64) } -/// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_json(file: &File, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); +/// Reads JSON from temporary file and write an obkv batch to writer. +pub fn read_json(file: &File, writer: impl Write) -> Result { + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); let mmap = unsafe { MmapOptions::new().map(file)? }; let mut deserializer = serde_json::Deserializer::from_slice(&mmap); @@ -151,8 +151,8 @@ pub fn read_json(file: &File, writer: impl Write + Seek) -> Result { } /// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_ndjson(file: &File, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); +pub fn read_ndjson(file: &File, writer: impl Write) -> Result { + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); let mmap = unsafe { MmapOptions::new().map(file)? 
}; for result in serde_json::Deserializer::from_slice(&mmap).into_iter() { diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index a74bbff49..43fab1dae 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -425,11 +425,9 @@ async fn document_addition( let read_file = buffer.into_inner().into_std().await; let documents_count = tokio::task::spawn_blocking(move || { let documents_count = match format { - PayloadType::Json => read_json(&read_file, update_file.as_file_mut())?, - PayloadType::Csv { delimiter } => { - read_csv(&read_file, update_file.as_file_mut(), delimiter)? - } - PayloadType::Ndjson => read_ndjson(&read_file, update_file.as_file_mut())?, + PayloadType::Json => read_json(&read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, + PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, }; // we NEED to persist the file here because we moved the `update_file` in another task. update_file.persist()?; From d3004d8040a9ddadbea6be97e8059c2d68ab50b6 Mon Sep 17 00:00:00 2001 From: Jakob Klemm Date: Sun, 3 Mar 2024 01:11:25 +0100 Subject: [PATCH 11/32] Implemented Ollama as an embeddings provider Initial prototype of Ollama embeddings actually working, error handling / retries still missing. Allow model to be any String and require dimensions parameter Fixed rustfmt formatting issues There were some formatting issues in the initial PR and this should now make the changes comply with the Rust style guidelines Because I accidentally didn't follow the style guide for commits in my commit messages, I squashed them into one to comply --- meilisearch/src/routes/indexes/settings.rs | 1 + milli/src/update/settings.rs | 7 + milli/src/vector/error.rs | 26 +++ milli/src/vector/mod.rs | 18 ++ milli/src/vector/ollama.rs | 255 +++++++++++++++++++++ milli/src/vector/openai.rs | 20 +- milli/src/vector/settings.rs | 38 ++- 7 files changed, 350 insertions(+), 15 deletions(-) create mode 100644 milli/src/vector/ollama.rs diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index c71d83279..c782e78cb 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -604,6 +604,7 @@ fn embedder_analytics( EmbedderSource::OpenAi => sources.insert("openAi"), EmbedderSource::HuggingFace => sources.insert("huggingFace"), EmbedderSource::UserProvided => sources.insert("userProvided"), + EmbedderSource::Ollama => sources.insert("ollama"), }; } }; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3cad79467..df273b023 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1178,6 +1178,13 @@ pub fn validate_embedding_settings( } } } + EmbedderSource::Ollama => { + // Existence & correct dimensions of models cannot easily be checked here.
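+ // (The validation below is purely structural: for Ollama, `model` and `dimensions` must be set, while `apiKey` and `revision` are rejected. Whether the model actually exists is only discovered at embedding time.)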
+ check_set(&dimensions, "dimensions", inferred_source, name)?; + check_set(&model, "model", inferred_source, name)?; + check_unset(&api_key, "apiKey", inferred_source, name)?; + check_unset(&revision, "revision", inferred_source, name)?; + } EmbedderSource::HuggingFace => { check_unset(&api_key, "apiKey", inferred_source, name)?; check_unset(&dimensions, "dimensions", inferred_source, name)?; diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 3673c85e3..ffdda42ca 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use hf_hub::api::sync::ApiError; +use super::ollama::OllamaError; use crate::error::FaultSource; use crate::vector::openai::OpenAiError; @@ -71,6 +72,15 @@ pub enum EmbedErrorKind { OpenAiRuntimeInit(std::io::Error), #[error("initializing web client for sending embedding requests failed: {0}")] InitWebClient(reqwest::Error), + // Dedicated Ollama error kinds, might have to merge them into one cohesive error type for all backends. + #[error("unexpected response from Ollama: {0}")] + OllamaUnexpected(reqwest::Error), + #[error("sent too many requests to Ollama: {0}")] + OllamaTooManyRequests(OllamaError), + #[error("received internal error from Ollama: {0}")] + OllamaInternalServerError(OllamaError), + #[error("received unhandled HTTP status code {0} from Ollama")] + OllamaUnhandledStatusCode(u16), } impl EmbedError { @@ -129,6 +139,22 @@ impl EmbedError { pub fn openai_initialize_web_client(inner: reqwest::Error) -> Self { Self { kind: EmbedErrorKind::InitWebClient(inner), fault: FaultSource::Runtime } } + + pub fn ollama_unexpected(inner: reqwest::Error) -> EmbedError { + Self { kind: EmbedErrorKind::OllamaUnexpected(inner), fault: FaultSource::Bug } + } + + pub(crate) fn ollama_too_many_requests(inner: OllamaError) -> EmbedError { + Self { kind: EmbedErrorKind::OllamaTooManyRequests(inner), fault: FaultSource::Runtime } + } + + pub(crate) fn ollama_internal_server_error(inner: OllamaError) -> EmbedError { + Self { kind: EmbedErrorKind::OllamaInternalServerError(inner), fault: FaultSource::Runtime } + } + + pub(crate) fn ollama_unhandled_status_code(code: u16) -> EmbedError { + Self { kind: EmbedErrorKind::OllamaUnhandledStatusCode(code), fault: FaultSource::Bug } + } } #[derive(Debug, thiserror::Error)] diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 99b7bff7e..635c72878 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -10,6 +10,8 @@ pub mod manual; pub mod openai; pub mod settings; +pub mod ollama; + pub use self::error::Error; pub type Embedding = Vec; @@ -76,6 +78,7 @@ pub enum Embedder { HuggingFace(hf::Embedder), OpenAi(openai::Embedder), UserProvided(manual::Embedder), + Ollama(ollama::Embedder), } #[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] @@ -127,6 +130,7 @@ impl IntoIterator for EmbeddingConfigs { pub enum EmbedderOptions { HuggingFace(hf::EmbedderOptions), OpenAi(openai::EmbedderOptions), + Ollama(ollama::EmbedderOptions), UserProvided(manual::EmbedderOptions), } @@ -144,6 +148,10 @@ impl EmbedderOptions { pub fn openai(api_key: Option) -> Self { Self::OpenAi(openai::EmbedderOptions::with_default_model(api_key)) } + + pub fn ollama() -> Self { + Self::Ollama(ollama::EmbedderOptions::with_default_model()) + } } impl Embedder { @@ -151,6 +159,7 @@ impl Embedder { Ok(match options { EmbedderOptions::HuggingFace(options) => Self::HuggingFace(hf::Embedder::new(options)?), EmbedderOptions::OpenAi(options) => 
Self::OpenAi(openai::Embedder::new(options)?), + EmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?), EmbedderOptions::UserProvided(options) => { Self::UserProvided(manual::Embedder::new(options)) } @@ -167,6 +176,10 @@ impl Embedder { let client = embedder.new_client()?; embedder.embed(texts, &client).await } + Embedder::Ollama(embedder) => { + let client = embedder.new_client()?; + embedder.embed(texts, &client).await + } Embedder::UserProvided(embedder) => embedder.embed(texts), } } @@ -181,6 +194,7 @@ impl Embedder { match self { Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks), + Embedder::Ollama(embedder) => embedder.embed_chunks(text_chunks), Embedder::UserProvided(embedder) => embedder.embed_chunks(text_chunks), } } @@ -189,6 +203,7 @@ impl Embedder { match self { Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(), Embedder::OpenAi(embedder) => embedder.chunk_count_hint(), + Embedder::Ollama(embedder) => embedder.chunk_count_hint(), Embedder::UserProvided(_) => 1, } } @@ -197,6 +212,7 @@ impl Embedder { match self { Embedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(), Embedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(), + Embedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), Embedder::UserProvided(_) => 1, } } @@ -205,6 +221,7 @@ impl Embedder { match self { Embedder::HuggingFace(embedder) => embedder.dimensions(), Embedder::OpenAi(embedder) => embedder.dimensions(), + Embedder::Ollama(embedder) => embedder.dimensions(), Embedder::UserProvided(embedder) => embedder.dimensions(), } } @@ -213,6 +230,7 @@ impl Embedder { match self { Embedder::HuggingFace(embedder) => embedder.distribution(), Embedder::OpenAi(embedder) => embedder.distribution(), + Embedder::Ollama(embedder) => embedder.distribution(), Embedder::UserProvided(_embedder) => None, } } diff --git a/milli/src/vector/ollama.rs b/milli/src/vector/ollama.rs new file mode 100644 index 000000000..a83022dbd --- /dev/null +++ b/milli/src/vector/ollama.rs @@ -0,0 +1,255 @@ +// Copied from "openai.rs" with the sections I actually understand changed for Ollama. +// The common components of the Ollama and OpenAI interfaces might need to be extracted. 
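+ // For context, a rough sketch of the wire format the request/response structs below model
+ // (inferred from this file's own defaults, not an authoritative API reference):
+ //   POST http://localhost:11434/api/embeddings
+ //   request body:  {"model": "nomic-embed-text", "prompt": "some text"}
+ //   response body: {"embedding": [0.15, -0.02, ...]}   (one vector per call)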
+ +use std::fmt::Display; + +use reqwest::StatusCode; + +use super::error::{EmbedError, NewEmbedderError}; +use super::openai::Retry; +use super::{DistributionShift, Embedding, Embeddings}; + +#[derive(Debug)] +pub struct Embedder { + headers: reqwest::header::HeaderMap, + options: EmbedderOptions, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub struct EmbedderOptions { + pub embedding_model: EmbeddingModel, + pub dimensions: usize, +} + +#[derive( + Debug, Clone, Hash, PartialEq, Eq, serde::Serialize, serde::Deserialize, deserr::Deserr, +)] +#[deserr(deny_unknown_fields)] +pub struct EmbeddingModel { + name: String, +} + +#[derive(Debug, serde::Serialize)] +struct OllamaRequest<'a> { + model: &'a str, + prompt: &'a str, +} + +#[derive(Debug, serde::Deserialize)] +struct OllamaResponse { + embedding: Embedding, +} + +#[derive(Debug, serde::Deserialize)] +struct OllamaErrorResponse { + error: OllamaError, +} + +#[derive(Debug, serde::Deserialize)] +pub struct OllamaError { + message: String, + // type: String, + code: Option<String>, +} + +impl EmbeddingModel { + pub fn max_token(&self) -> usize { + // this might not be the same for all models + 8192 + } + + pub fn default_dimensions(&self) -> usize { + // Dimensions for nomic-embed-text + 768 + } + + pub fn name(&self) -> String { + self.name.clone() + } + + pub fn from_name(name: &str) -> Self { + Self { name: name.to_string() } + } + + pub fn supports_overriding_dimensions(&self) -> bool { + false + } +} + +impl Default for EmbeddingModel { + fn default() -> Self { + Self { name: "nomic-embed-text".to_string() } + } +} + +impl EmbedderOptions { + pub fn with_default_model() -> Self { + Self { embedding_model: Default::default(), dimensions: 768 } + } + + pub fn with_embedding_model(embedding_model: EmbeddingModel, dimensions: usize) -> Self { + Self { embedding_model, dimensions } + } +} + +impl Embedder { + pub fn new_client(&self) -> Result<reqwest::Client, EmbedError> { + reqwest::ClientBuilder::new() + .default_headers(self.headers.clone()) + .build() + .map_err(EmbedError::openai_initialize_web_client) + } + + pub fn new(options: EmbedderOptions) -> Result<Self, NewEmbedderError> { + let mut headers = reqwest::header::HeaderMap::new(); + headers.insert( + reqwest::header::CONTENT_TYPE, + reqwest::header::HeaderValue::from_static("application/json"), + ); + + Ok(Self { options, headers }) + } + + async fn check_response(response: reqwest::Response) -> Result<reqwest::Response, Retry> { + if !response.status().is_success() { + // Not the same number of possible error cases covered as with OpenAI. + match response.status() { + StatusCode::TOO_MANY_REQUESTS => { + let error_response: OllamaErrorResponse = response + .json() + .await + .map_err(EmbedError::ollama_unexpected) + .map_err(Retry::retry_later)?; + + return Err(Retry::rate_limited(EmbedError::ollama_too_many_requests( + error_response.error, + ))); + } + StatusCode::SERVICE_UNAVAILABLE => { + let error_response: OllamaErrorResponse = response + .json() + .await + .map_err(EmbedError::ollama_unexpected) + .map_err(Retry::retry_later)?; + return Err(Retry::retry_later(EmbedError::ollama_internal_server_error( + error_response.error, + ))); + } + code => { + return Err(Retry::give_up(EmbedError::ollama_unhandled_status_code( + code.as_u16(), + ))); + } + } + } + Ok(response) + } + + pub async fn embed( + &self, + texts: Vec<String>, + client: &reqwest::Client, + ) -> Result<Vec<Embeddings<f32>>, EmbedError> { + // Ollama only embeds one document at a time.
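+ // (A batch of N texts therefore issues N sequential HTTP calls; each call is retried up to 7 times in the loop below, with Retry::into_duration picking the backoff, e.g. 10^attempt milliseconds for plain retries.)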
+ let mut results = Vec::with_capacity(texts.len()); + + // The retry loop is inside the texts loop, might have to switch that around + for text in texts { + // Retries copied from openai.rs + for attempt in 0..7 { + let retry_duration = match self.try_embed(&text, client).await { + Ok(result) => { + results.push(result); + break; + } + Err(retry) => { + tracing::warn!("Failed: {}", retry.error); + retry.into_duration(attempt) + } + }?; + tracing::warn!( + "Attempt #{}, retrying after {}ms.", + attempt, + retry_duration.as_millis() + ); + tokio::time::sleep(retry_duration).await; + } + } + + Ok(results) + } + + async fn try_embed( + &self, + text: &str, + client: &reqwest::Client, + ) -> Result, Retry> { + let request = OllamaRequest { model: &self.options.embedding_model.name(), prompt: text }; + let response = client + .post(get_ollama_path()) + .json(&request) + .send() + .await + .map_err(EmbedError::openai_network) + .map_err(Retry::retry_later)?; + + let response = Self::check_response(response).await?; + + let response: OllamaResponse = response + .json() + .await + .map_err(EmbedError::openai_unexpected) + .map_err(Retry::retry_later)?; + + tracing::trace!("response: {:?}", response.embedding); + + let embedding = Embeddings::from_single_embedding(response.embedding); + Ok(embedding) + } + + pub fn embed_chunks( + &self, + text_chunks: Vec>, + ) -> Result>>, EmbedError> { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_io() + .enable_time() + .build() + .map_err(EmbedError::openai_runtime_init)?; + let client = self.new_client()?; + rt.block_on(futures::future::try_join_all( + text_chunks.into_iter().map(|prompts| self.embed(prompts, &client)), + )) + } + + // Defaults copied from openai.rs + pub fn chunk_count_hint(&self) -> usize { + 10 + } + + pub fn prompt_count_in_chunk_hint(&self) -> usize { + 10 + } + + pub fn dimensions(&self) -> usize { + self.options.dimensions + } + + pub fn distribution(&self) -> Option { + None + } +} + +impl Display for OllamaError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.code { + Some(code) => write!(f, "{} ({})", self.message, code), + None => write!(f, "{}", self.message), + } + } +} + +fn get_ollama_path() -> String { + // Important: Hostname not enough, has to be entire path to embeddings endpoint + std::env::var("MEILI_OLLAMA_URL").unwrap_or("http://localhost:11434/api/embeddings".to_string()) +} diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index cbddddfb7..ae0fdddf8 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -429,12 +429,12 @@ impl Embedder { // retrying in case of failure -struct Retry { - error: EmbedError, +pub struct Retry { + pub error: EmbedError, strategy: RetryStrategy, } -enum RetryStrategy { +pub enum RetryStrategy { GiveUp, Retry, RetryTokenized, @@ -442,23 +442,23 @@ enum RetryStrategy { } impl Retry { - fn give_up(error: EmbedError) -> Self { + pub fn give_up(error: EmbedError) -> Self { Self { error, strategy: RetryStrategy::GiveUp } } - fn retry_later(error: EmbedError) -> Self { + pub fn retry_later(error: EmbedError) -> Self { Self { error, strategy: RetryStrategy::Retry } } - fn retry_tokenized(error: EmbedError) -> Self { + pub fn retry_tokenized(error: EmbedError) -> Self { Self { error, strategy: RetryStrategy::RetryTokenized } } - fn rate_limited(error: EmbedError) -> Self { + pub fn rate_limited(error: EmbedError) -> Self { Self { error, strategy: RetryStrategy::RetryAfterRateLimit } } - fn 
into_duration(self, attempt: u32) -> Result { + pub fn into_duration(self, attempt: u32) -> Result { match self.strategy { RetryStrategy::GiveUp => Err(self.error), RetryStrategy::Retry => Ok(tokio::time::Duration::from_millis((10u64).pow(attempt))), @@ -469,11 +469,11 @@ impl Retry { } } - fn must_tokenize(&self) -> bool { + pub fn must_tokenize(&self) -> bool { matches!(self.strategy, RetryStrategy::RetryTokenized) } - fn into_error(self) -> EmbedError { + pub fn into_error(self) -> EmbedError { self.error } } diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 834a1c81d..5595f60e3 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -1,7 +1,7 @@ use deserr::Deserr; use serde::{Deserialize, Serialize}; -use super::openai; +use super::{ollama, openai}; use crate::prompt::PromptData; use crate::update::Setting; use crate::vector::EmbeddingConfig; @@ -80,11 +80,17 @@ impl EmbeddingSettings { Self::SOURCE => { &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::UserProvided] } - Self::MODEL => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], + Self::MODEL => { + &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::Ollama] + } Self::REVISION => &[EmbedderSource::HuggingFace], Self::API_KEY => &[EmbedderSource::OpenAi], - Self::DIMENSIONS => &[EmbedderSource::OpenAi, EmbedderSource::UserProvided], - Self::DOCUMENT_TEMPLATE => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], + Self::DIMENSIONS => { + &[EmbedderSource::OpenAi, EmbedderSource::UserProvided, EmbedderSource::Ollama] + } + Self::DOCUMENT_TEMPLATE => { + &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::Ollama] + } _other => unreachable!("unknown field"), } } @@ -101,6 +107,9 @@ impl EmbeddingSettings { EmbedderSource::HuggingFace => { &[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE] } + EmbedderSource::Ollama => { + &[Self::SOURCE, Self::MODEL, Self::DIMENSIONS, Self::DOCUMENT_TEMPLATE] + } EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS], } } @@ -134,6 +143,7 @@ pub enum EmbedderSource { #[default] OpenAi, HuggingFace, + Ollama, UserProvided, } @@ -143,6 +153,7 @@ impl std::fmt::Display for EmbedderSource { EmbedderSource::OpenAi => "openAi", EmbedderSource::HuggingFace => "huggingFace", EmbedderSource::UserProvided => "userProvided", + EmbedderSource::Ollama => "ollama", }; f.write_str(s) } @@ -192,7 +203,15 @@ impl From for EmbeddingSettings { model: Setting::Set(options.embedding_model.name().to_owned()), revision: Setting::NotSet, api_key: options.api_key.map(Setting::Set).unwrap_or_default(), - dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(), + dimensions: Setting::Set(options.dimensions.unwrap_or_default()), + document_template: Setting::Set(prompt.template), + }, + super::EmbedderOptions::Ollama(options) => Self { + source: Setting::Set(EmbedderSource::Ollama), + model: Setting::Set(options.embedding_model.name().to_owned()), + revision: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::Set(options.dimensions), document_template: Setting::Set(prompt.template), }, super::EmbedderOptions::UserProvided(options) => Self { @@ -229,6 +248,15 @@ impl From for EmbeddingConfig { } this.embedder_options = super::EmbedderOptions::OpenAi(options); } + EmbedderSource::Ollama => { + let mut options: ollama::EmbedderOptions = + super::ollama::EmbedderOptions::with_default_model(); + if let (Some(model), Some(dim)) = (model.set(), 
dimensions.set()) { + options.embedding_model = super::ollama::EmbeddingModel::from_name(&model); + options.dimensions = dim; + } + this.embedder_options = super::EmbedderOptions::Ollama(options); + } EmbedderSource::HuggingFace => { let mut options = super::hf::EmbedderOptions::default(); if let Some(model) = model.set() { From f07069094b628bab58f2fd1e896b3102f0450627 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 22:03:25 +0000 Subject: [PATCH 12/32] Bump mio from 0.8.9 to 0.8.11 Bumps [mio](https://github.com/tokio-rs/mio) from 0.8.9 to 0.8.11. - [Release notes](https://github.com/tokio-rs/mio/releases) - [Changelog](https://github.com/tokio-rs/mio/blob/master/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/mio/compare/v0.8.9...v0.8.11) --- updated-dependencies: - dependency-name: mio dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 962f30853..7be74bd70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3954,9 +3954,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.9" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", From f053c280e112a7e36487fdd6fb42106cc4b0fa68 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Mar 2024 18:42:41 +0100 Subject: [PATCH 13/32] add tests when the field limit is reached --- meilisearch/tests/documents/add_documents.rs | 239 ++++++++++++++++++- 1 file changed, 229 insertions(+), 10 deletions(-) diff --git a/meilisearch/tests/documents/add_documents.rs b/meilisearch/tests/documents/add_documents.rs index e6af85229..b1262fa2d 100644 --- a/meilisearch/tests/documents/add_documents.rs +++ b/meilisearch/tests/documents/add_documents.rs @@ -1237,8 +1237,8 @@ async fn error_add_documents_missing_document_id() { } #[actix_rt::test] -#[ignore] // // TODO: Fix in an other PR: this does not provoke any error. -async fn error_document_field_limit_reached() { +#[should_panic] +async fn error_document_field_limit_reached_in_one_document() { let server = Server::new().await; let index = server.index("test"); index.create(Some("id")).await; let mut big_object = std::collections::HashMap::new(); big_object.insert("id".to_owned(), "wow"); - for i in 0..65535 { + for i in 0..(u16::MAX as usize + 1) { let key = i.to_string(); big_object.insert(key, "I am a text!"); } let documents = json!([big_object]); - let (_response, code) = index.update_documents(documents, Some("id")).await; - snapshot!(code, @"202"); + let (response, code) = index.update_documents(documents, Some("id")).await; + snapshot!(code, @"500 Internal Server Error"); - index.wait_task(0).await; - let (response, code) = index.get_task(0).await; - snapshot!(code, @"200"); + let response = index.wait_task(response.uid()).await; + snapshot!(code, @"202 Accepted");
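+ // Note: the 65,535 cap tracks milli addressing fields with u16 ids, which is why these tests use u16::MAX as usize + 1 distinct field names to overflow it.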
- snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), - @""); + snapshot!(response, + @r###" + { + "uid": 1, + "indexUid": "test", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn error_document_field_limit_reached_over_multiple_documents() { + let server = Server::new().await; + let index = server.index("test"); + + index.create(Some("id")).await; + + let mut big_object = std::collections::HashMap::new(); + big_object.insert("id".to_owned(), "wow"); + for i in 0..(u16::MAX / 2) { + let key = i.to_string(); + big_object.insert(key, "I am a text!"); + } + + let documents = json!([big_object]); + + let (response, code) = index.update_documents(documents, Some("id")).await; + snapshot!(code, @"202 Accepted"); + + let response = index.wait_task(response.uid()).await; + snapshot!(code, @"202 Accepted"); + snapshot!(response, + @r###" + { + "uid": 1, + "indexUid": "test", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let mut big_object = std::collections::HashMap::new(); + big_object.insert("id".to_owned(), "waw"); + for i in (u16::MAX as usize / 2)..(u16::MAX as usize + 1) { + let key = i.to_string(); + big_object.insert(key, "I am a text!"); + } + + let documents = json!([big_object]); + + let (response, code) = index.update_documents(documents, Some("id")).await; + snapshot!(code, @"202 Accepted"); + + let response = index.wait_task(response.uid()).await; + snapshot!(code, @"202 Accepted"); + snapshot!(response, + @r###" + { + "uid": 2, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "A document cannot contain more than 65,535 fields.", + "code": "max_fields_limit_exceeded", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#max_fields_limit_exceeded" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn error_document_field_limit_reached_in_one_nested_document() { + let server = Server::new().await; + let index = server.index("test"); + + index.create(Some("id")).await; + + let mut nested = std::collections::HashMap::new(); + for i in 0..(u16::MAX as usize + 1) { + let key = i.to_string(); + nested.insert(key, "I am a text!"); + } + let mut big_object = std::collections::HashMap::new(); + big_object.insert("id".to_owned(), "wow"); + + let documents = json!([big_object]); + + let (response, code) = index.update_documents(documents, Some("id")).await; + snapshot!(code, @"202 Accepted"); + + let response = index.wait_task(response.uid()).await; + snapshot!(code, @"202 Accepted");
+ snapshot!(response, + @r###" + { + "uid": 1, + "indexUid": "test", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn error_document_field_limit_reached_over_multiple_documents_with_nested_fields() { + let server = Server::new().await; + let index = server.index("test"); + + index.create(Some("id")).await; + + let mut nested = std::collections::HashMap::new(); + for i in 0..(u16::MAX / 2) { + let key = i.to_string(); + nested.insert(key, "I am a text!"); + } + let mut big_object = std::collections::HashMap::new(); + big_object.insert("id".to_owned(), "wow"); + + let documents = json!([big_object]); + + let (response, code) = index.update_documents(documents, Some("id")).await; + snapshot!(code, @"202 Accepted"); + + let response = index.wait_task(response.uid()).await; + snapshot!(code, @"202 Accepted"); + snapshot!(response, + @r###" + { + "uid": 1, + "indexUid": "test", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let mut nested = std::collections::HashMap::new(); + for i in 0..(u16::MAX / 2) { + let key = i.to_string(); + nested.insert(key, "I am a text!"); + } + let mut big_object = std::collections::HashMap::new(); + big_object.insert("id".to_owned(), "wow"); + + let documents = json!([big_object]); + + let (response, code) = index.update_documents(documents, Some("id")).await; + snapshot!(code, @"202 Accepted"); + + let response = index.wait_task(response.uid()).await; + snapshot!(code, @"202 Accepted"); + snapshot!(response, + @r###" + { + "uid": 2, + "indexUid": "test", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); } #[actix_rt::test] From fde209b7b6fa6c7e9be9022b86af6f8859ce1aee Mon Sep 17 00:00:00 2001 From: curquiza Date: Tue, 12 Mar 2024 10:20:07 +0100 Subject: [PATCH 14/32] Update cargo version --- Cargo.lock | 34 +++++++++++++++++----------------- Cargo.toml | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24540c455..bdca7e24c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -494,7 +494,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "benchmarks" -version = "1.7.0" +version = "1.8.0" dependencies = [ "anyhow", "bytes", @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.7.0" +version = "1.8.0" dependencies = [ "anyhow", "time", @@ -1529,7 +1529,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.7.0" +version = "1.8.0" dependencies = [ "anyhow", "big_s", @@ -1767,7 +1767,7 @@ dependencies = [ [[package]] name = "file-store" -version = "1.7.0" +version = "1.8.0" dependencies = [ "faux", "tempfile", @@ -1790,7 +1790,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.7.0" +version = "1.8.0" dependencies = [ "insta", "nom", @@ -1810,7 +1810,7 @@ dependencies = [ 
[[package]] name = "flatten-serde-json" -version = "1.7.0" +version = "1.8.0" dependencies = [ "criterion", "serde_json", @@ -1928,7 +1928,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.7.0" +version = "1.8.0" dependencies = [ "arbitrary", "clap", @@ -2422,7 +2422,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" [[package]] name = "index-scheduler" -version = "1.7.0" +version = "1.8.0" dependencies = [ "anyhow", "big_s", @@ -2609,7 +2609,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.7.0" +version = "1.8.0" dependencies = [ "criterion", "serde_json", @@ -3118,7 +3118,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.7.0" +version = "1.8.0" dependencies = [ "insta", "md5", @@ -3127,7 +3127,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.7.0" +version = "1.8.0" dependencies = [ "actix-cors", "actix-http", @@ -3220,7 +3220,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.7.0" +version = "1.8.0" dependencies = [ "base64 0.21.7", "enum-iterator", @@ -3239,7 +3239,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.7.0" +version = "1.8.0" dependencies = [ "actix-web", "anyhow", @@ -3269,7 +3269,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.7.0" +version = "1.8.0" dependencies = [ "anyhow", "clap", @@ -3308,7 +3308,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.7.0" +version = "1.8.0" dependencies = [ "arroy", "big_s", @@ -3750,7 +3750,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.7.0" +version = "1.8.0" dependencies = [ "big_s", "serde_json", @@ -5876,7 +5876,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.7.0" +version = "1.8.0" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index 1d79fd196..1d0e0ca0d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ members = [ ] [workspace.package] -version = "1.7.0" +version = "1.8.0" authors = [ "Quentin de Quelen ", "ClĂ©ment Renault ", From 7b670a4afadb132ac4a01b6403108700501a391d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Fri, 8 Mar 2024 21:28:08 +0100 Subject: [PATCH 15/32] Allow dry runs for benchmarks where reports are generated but not sent to the dashboard --- xtask/src/bench/dashboard.rs | 312 +++++++++++++++++++---------------- xtask/src/bench/mod.rs | 24 +-- xtask/src/bench/workload.rs | 16 +- 3 files changed, 189 insertions(+), 163 deletions(-) diff --git a/xtask/src/bench/dashboard.rs b/xtask/src/bench/dashboard.rs index 833426207..3ba0ca58b 100644 --- a/xtask/src/bench/dashboard.rs +++ b/xtask/src/bench/dashboard.rs @@ -11,157 +11,179 @@ use super::client::Client; use super::env_info; use super::workload::Workload; -pub async fn cancel_on_ctrl_c( - invocation_uuid: Uuid, - dashboard_client: Client, - abort_handle: AbortHandle, -) { - tracing::info!("press Ctrl-C to cancel the invocation"); - match ctrl_c().await { - Ok(()) => { - tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation"); - mark_as_failed(dashboard_client, invocation_uuid, None).await; - abort_handle.abort(); +#[derive(Debug, Clone)] +pub enum DashboardClient { + Client(Client), + Dry, +} + +impl DashboardClient { + pub fn new(dashboard_url: &str, api_key: Option<&str>) -> anyhow::Result { + let dashboard_client = Client::new( + 
Some(format!("{}/api/v1", dashboard_url)), + api_key, + Some(std::time::Duration::from_secs(60)), + )?; + + Ok(Self::Client(dashboard_client)) + } + + pub fn new_dry() -> Self { + Self::Dry + } + + pub async fn send_machine_info(&self, env: &env_info::Environment) -> anyhow::Result<()> { + let Self::Client(dashboard_client) = self else { return Ok(()) }; + + let response = dashboard_client + .put("machine") + .json(&json!({"hostname": env.hostname})) + .send() + .await + .context("sending machine information")?; + if !response.status().is_success() { + bail!( + "could not send machine information: {} {}", + response.status(), + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); } - Err(error) => tracing::warn!( - error = &error as &dyn std::error::Error, - "failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C" - ), + Ok(()) } -} -pub async fn mark_as_failed( - dashboard_client: Client, - invocation_uuid: Uuid, - failure_reason: Option, -) { - let response = dashboard_client - .post("cancel-invocation") - .json(&json!({ - "invocation_uuid": invocation_uuid, - "failure_reason": failure_reason, - })) - .send() - .await; - let response = match response { - Ok(response) => response, - Err(response_error) => { - tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed"); - return; + pub async fn create_invocation( + &self, + build_info: build_info::BuildInfo, + commit_message: &str, + env: env_info::Environment, + max_workloads: usize, + reason: Option<&str>, + ) -> anyhow::Result { + let Self::Client(dashboard_client) = self else { return Ok(Uuid::now_v7()) }; + + let response = dashboard_client + .put("invocation") + .json(&json!({ + "commit": { + "sha1": build_info.commit_sha1, + "message": commit_message, + "commit_date": build_info.commit_timestamp, + "branch": build_info.branch, + "tag": build_info.describe.and_then(|describe| describe.as_tag()), + }, + "machine_hostname": env.hostname, + "max_workloads": max_workloads, + "reason": reason + })) + .send() + .await + .context("sending invocation")?; + if !response.status().is_success() { + bail!( + "could not send new invocation: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); } - }; - - if !response.status().is_success() { - tracing::error!( - %invocation_uuid, - "could not mark invocation as failed: {}", - response.text().await.unwrap() - ); - return; - } - tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); -} - -pub async fn send_machine_info( - dashboard_client: &Client, - env: &env_info::Environment, -) -> anyhow::Result<()> { - let response = dashboard_client - .put("machine") - .json(&json!({"hostname": env.hostname})) - .send() - .await - .context("sending machine information")?; - if !response.status().is_success() { - bail!( - "could not send machine information: {} {}", - response.status(), - response.text().await.unwrap_or_else(|_| "unknown".into()) - ); - } - Ok(()) -} - -pub async fn create_invocation( - dashboard_client: &Client, - build_info: build_info::BuildInfo, - commit_message: &str, - env: env_info::Environment, - max_workloads: usize, - reason: Option<&str>, -) -> anyhow::Result { - let response = dashboard_client - .put("invocation") - .json(&json!({ - "commit": { - "sha1": build_info.commit_sha1, - "message": commit_message, - "commit_date": build_info.commit_timestamp, - "branch": build_info.branch, - "tag": build_info.describe.and_then(|describe| describe.as_tag()), - 
}, - "machine_hostname": env.hostname, - "max_workloads": max_workloads, - "reason": reason - })) - .send() - .await - .context("sending invocation")?; - if !response.status().is_success() { - bail!( - "could not send new invocation: {}", - response.text().await.unwrap_or_else(|_| "unknown".into()) - ); - } - let invocation_uuid: Uuid = - response.json().await.context("could not deserialize invocation response as JSON")?; - Ok(invocation_uuid) -} - -pub async fn create_workload( - dashboard_client: &Client, - invocation_uuid: Uuid, - workload: &Workload, -) -> anyhow::Result { - let response = dashboard_client - .put("workload") - .json(&json!({ - "invocation_uuid": invocation_uuid, - "name": &workload.name, - "max_runs": workload.run_count, - })) - .send() - .await - .context("could not create new workload")?; - - if !response.status().is_success() { - bail!("creating new workload failed: {}", response.text().await.unwrap()) + let invocation_uuid: Uuid = + response.json().await.context("could not deserialize invocation response as JSON")?; + Ok(invocation_uuid) } - let workload_uuid: Uuid = - response.json().await.context("could not deserialize JSON as UUID")?; - Ok(workload_uuid) -} + pub async fn create_workload( + &self, + invocation_uuid: Uuid, + workload: &Workload, + ) -> anyhow::Result { + let Self::Client(dashboard_client) = self else { return Ok(Uuid::now_v7()) }; -pub async fn create_run( - dashboard_client: Client, - workload_uuid: Uuid, - report: &BTreeMap, -) -> anyhow::Result<()> { - let response = dashboard_client - .put("run") - .json(&json!({ - "workload_uuid": workload_uuid, - "data": report - })) - .send() - .await - .context("sending new run")?; - if !response.status().is_success() { - bail!( - "sending new run failed: {}", - response.text().await.unwrap_or_else(|_| "unknown".into()) - ) + let response = dashboard_client + .put("workload") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "name": &workload.name, + "max_runs": workload.run_count, + })) + .send() + .await + .context("could not create new workload")?; + + if !response.status().is_success() { + bail!("creating new workload failed: {}", response.text().await.unwrap()) + } + + let workload_uuid: Uuid = + response.json().await.context("could not deserialize JSON as UUID")?; + Ok(workload_uuid) + } + + pub async fn create_run( + &self, + workload_uuid: Uuid, + report: &BTreeMap, + ) -> anyhow::Result<()> { + let Self::Client(dashboard_client) = self else { return Ok(()) }; + + let response = dashboard_client + .put("run") + .json(&json!({ + "workload_uuid": workload_uuid, + "data": report + })) + .send() + .await + .context("sending new run")?; + if !response.status().is_success() { + bail!( + "sending new run failed: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ) + } + Ok(()) + } + + pub async fn cancel_on_ctrl_c(self, invocation_uuid: Uuid, abort_handle: AbortHandle) { + tracing::info!("press Ctrl-C to cancel the invocation"); + match ctrl_c().await { + Ok(()) => { + tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation"); + self.mark_as_failed(invocation_uuid, None).await; + abort_handle.abort(); + } + Err(error) => tracing::warn!( + error = &error as &dyn std::error::Error, + "failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C" + ), + } + } + + pub async fn mark_as_failed(&self, invocation_uuid: Uuid, failure_reason: Option) { + if let DashboardClient::Client(client) = self { + let response = client + .post("cancel-invocation") + 
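+ // Only reached for the `Client` variant: in --no-dashboard (dry) runs the `if let` above skips every network call and only the warning below is logged.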
.json(&json!({ + "invocation_uuid": invocation_uuid, + "failure_reason": failure_reason, + })) + .send() + .await; + let response = match response { + Ok(response) => response, + Err(response_error) => { + tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed"); + return; + } + }; + + if !response.status().is_success() { + tracing::error!( + %invocation_uuid, + "could not mark invocation as failed: {}", + response.text().await.unwrap() + ); + return; + } + } + + tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); } - Ok(()) } diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs index 62c11b604..844b64f63 100644 --- a/xtask/src/bench/mod.rs +++ b/xtask/src/bench/mod.rs @@ -50,6 +50,10 @@ pub struct BenchDeriveArgs { #[arg(long, default_value_t = default_dashboard_url())] dashboard_url: String, + /// Don't actually send results to the dashboard + #[arg(long)] + no_dashboard: bool, + /// Directory to output reports. #[arg(long, default_value_t = default_report_folder())] report_folder: String, @@ -103,11 +107,11 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let assets_client = Client::new(None, args.assets_key.as_deref(), Some(std::time::Duration::from_secs(3600)))?; // 1h - let dashboard_client = Client::new( - Some(format!("{}/api/v1", args.dashboard_url)), - args.api_key.as_deref(), - Some(std::time::Duration::from_secs(60)), - )?; + let dashboard_client = if args.no_dashboard { + dashboard::DashboardClient::new_dry() + } else { + dashboard::DashboardClient::new(&args.dashboard_url, args.api_key.as_deref())? + }; // reporting uses its own client because keeping the stream open to wait for entries // blocks any other requests @@ -127,12 +131,12 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { // enter runtime rt.block_on(async { - dashboard::send_machine_info(&dashboard_client, &env).await?; + dashboard_client.send_machine_info(&env).await?; let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); - let invocation_uuid = dashboard::create_invocation(&dashboard_client, build_info, commit_message, env, max_workloads, reason).await?; + let invocation_uuid = dashboard_client.create_invocation( build_info, commit_message, env, max_workloads, reason).await?; tracing::info!(workload_count = args.workload_file.len(), "handling workload files"); @@ -167,7 +171,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let abort_handle = workload_runs.abort_handle(); tokio::spawn({ let dashboard_client = dashboard_client.clone(); - dashboard::cancel_on_ctrl_c(invocation_uuid, dashboard_client, abort_handle) + dashboard_client.cancel_on_ctrl_c(invocation_uuid, abort_handle) }); // wait for the end of the main task, handle result @@ -178,7 +182,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { } Ok(Err(error)) => { tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard"); - dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some(error.to_string())).await; + dashboard_client.mark_as_failed(invocation_uuid, Some(error.to_string())).await; tracing::warn!(%invocation_uuid, "invocation marked as failed following error"); Err(error) }, @@ -186,7 +190,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { match join_error.try_into_panic() { 
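// If the workload task panicked, the failure is reported to the dashboard first and the panic is then resumed below, so the process still fails loudly.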
Ok(panic) => { tracing::error!("invocation panicked, attempting to report the failure to dashboard"); - dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some("Panicked".into())).await; + dashboard_client.mark_as_failed( invocation_uuid, Some("Panicked".into())).await; std::panic::resume_unwind(panic) } Err(_) => { diff --git a/xtask/src/bench/workload.rs b/xtask/src/bench/workload.rs index b3e952f29..d82c5ad19 100644 --- a/xtask/src/bench/workload.rs +++ b/xtask/src/bench/workload.rs @@ -12,8 +12,9 @@ use uuid::Uuid; use super::assets::Asset; use super::client::Client; use super::command::SyncMode; +use super::dashboard::DashboardClient; use super::BenchDeriveArgs; -use crate::bench::{assets, dashboard, meili_process}; +use crate::bench::{assets, meili_process}; #[derive(Deserialize)] pub struct Workload { @@ -25,7 +26,7 @@ pub struct Workload { } async fn run_commands( - dashboard_client: &Client, + dashboard_client: &DashboardClient, logs_client: &Client, meili_client: &Client, workload_uuid: Uuid, @@ -64,7 +65,7 @@ async fn run_commands( #[tracing::instrument(skip(assets_client, dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = workload.name))] pub async fn execute( assets_client: &Client, - dashboard_client: &Client, + dashboard_client: &DashboardClient, logs_client: &Client, meili_client: &Client, invocation_uuid: Uuid, @@ -74,8 +75,7 @@ pub async fn execute( ) -> anyhow::Result<()> { assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; - let workload_uuid = - dashboard::create_workload(dashboard_client, invocation_uuid, &workload).await?; + let workload_uuid = dashboard_client.create_workload(invocation_uuid, &workload).await?; let mut tasks = Vec::new(); @@ -113,7 +113,7 @@ pub async fn execute( #[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner #[tracing::instrument(skip(dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = %workload.name))] async fn execute_run( - dashboard_client: &Client, + dashboard_client: &DashboardClient, logs_client: &Client, meili_client: &Client, workload_uuid: Uuid, @@ -202,7 +202,7 @@ async fn start_report( } async fn stop_report( - dashboard_client: &Client, + dashboard_client: &DashboardClient, logs_client: &Client, workload_uuid: Uuid, filename: String, @@ -232,7 +232,7 @@ async fn stop_report( .context("could not convert trace to report")?; let context = || format!("writing report to {filename}"); - dashboard::create_run(dashboard_client, workload_uuid, &report).await?; + dashboard_client.create_run(workload_uuid, &report).await?; let mut output_file = std::io::BufWriter::new( std::fs::File::options() From 69c118ef766f19745ba8800c508e3a1cf686288b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 12 Mar 2024 10:35:39 +0100 Subject: [PATCH 16/32] Extract the facet order before extracting the facets values --- meilisearch/src/search.rs | 9 ++++++++- milli/src/search/mod.rs | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 27de36c6d..6e253baad 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -726,8 +726,15 @@ pub fn perform_facet_search( let rtxn = index.read_txn()?; let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features, None)?; + let sort_by = { + let sorts = index.sort_facet_values_by(&rtxn)?; + sorts + .get(&facet_name) + .copied() + 
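+ // fall back to the global "*" ordering when this facet has no explicit entry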
.unwrap_or_else(|| sorts.get("*").copied().unwrap_or_default()) + }; let mut facet_search = - SearchForFacetValues::new(facet_name, search, search_query.hybrid.is_some()); + SearchForFacetValues::new(facet_name, search, sort_by, search_query.hybrid.is_some()); if let Some(facet_query) = &facet_query { facet_search.query(facet_query); } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e411bd032..2b0dbe423 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -307,6 +307,7 @@ pub struct SearchForFacetValues<'a> { facet: String, search_query: Search<'a>, max_values: usize, + sort_by: OrderBy, is_hybrid: bool, } @@ -314,6 +315,7 @@ impl<'a> SearchForFacetValues<'a> { pub fn new( facet: String, search_query: Search<'a>, + sort_by: OrderBy, is_hybrid: bool, ) -> SearchForFacetValues<'a> { SearchForFacetValues { @@ -321,6 +323,7 @@ impl<'a> SearchForFacetValues<'a> { facet, search_query, max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, + sort_by, is_hybrid, } } From d3a95ea2f66ae90f62385b9b52bf39f66358cf19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 12 Mar 2024 11:01:46 +0100 Subject: [PATCH 17/32] Introduce a new OrderByMap struct to simplify the sort by usage --- meilisearch/src/search.rs | 27 ++++------------- milli/src/index.rs | 14 ++++----- milli/src/lib.rs | 1 + milli/src/order_by_map.rs | 57 ++++++++++++++++++++++++++++++++++++ milli/src/update/settings.rs | 7 +++-- 5 files changed, 73 insertions(+), 33 deletions(-) create mode 100644 milli/src/order_by_map.rs diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 6e253baad..8f3df04e0 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -671,27 +671,16 @@ pub fn perform_search( let sort_facet_values_by = index.sort_facet_values_by(&rtxn).map_err(milli::Error::from)?; - let default_sort_facet_values_by = - sort_facet_values_by.get("*").copied().unwrap_or_default(); if fields.iter().all(|f| f != "*") { - let fields: Vec<_> = fields - .iter() - .map(|n| { - ( - n, - sort_facet_values_by - .get(n) - .copied() - .unwrap_or(default_sort_facet_values_by), - ) - }) - .collect(); + let fields: Vec<_> = + fields.iter().map(|n| (n, sort_facet_values_by.get(n))).collect(); facet_distribution.facets(fields); } + let distribution = facet_distribution .candidates(candidates) - .default_order_by(default_sort_facet_values_by) + .default_order_by(sort_facet_values_by.get("*")) .execute()?; let stats = facet_distribution.compute_stats()?; (Some(distribution), Some(stats)) @@ -726,13 +715,7 @@ pub fn perform_facet_search( let rtxn = index.read_txn()?; let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features, None)?; - let sort_by = { - let sorts = index.sort_facet_values_by(&rtxn)?; - sorts - .get(&facet_name) - .copied() - .unwrap_or_else(|| sorts.get("*").copied().unwrap_or_default()) - }; + let sort_by = index.sort_facet_values_by(&rtxn)?.get(&facet_name); let mut facet_search = SearchForFacetValues::new(facet_name, search, sort_by, search_query.hybrid.is_some()); if let Some(facet_query) = &facet_query { diff --git a/milli/src/index.rs b/milli/src/index.rs index 6ad39dcb1..2c3977403 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -20,13 +20,13 @@ use crate::heed_codec::facet::{ use crate::heed_codec::{ BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, }; +use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ 
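+ // e.g. a payload like `{"genres": "count"}` deserializes into a map where get("genres") is OrderBy::Count and every other facet resolves through the implicit "*" entry (lexicographic by default).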
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
-    OrderBy, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16,
-    BEU32, BEU64,
+    Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -1373,21 +1373,19 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(txn, main_key::MAX_VALUES_PER_FACET)
     }
 
-    pub fn sort_facet_values_by(&self, txn: &RoTxn) -> heed::Result<HashMap<String, OrderBy>> {
-        let mut orders = self
+    pub fn sort_facet_values_by(&self, txn: &RoTxn) -> heed::Result<OrderByMap> {
+        let orders = self
             .main
-            .remap_types::<Str, SerdeJson<HashMap<String, OrderBy>>>()
+            .remap_types::<Str, SerdeJson<OrderByMap>>()
             .get(txn, main_key::SORT_FACET_VALUES_BY)?
             .unwrap_or_default();
-        // Insert the default ordering if it is not already overwritten by the user.
-        orders.entry("*".to_string()).or_insert(OrderBy::Lexicographic);
         Ok(orders)
     }
 
     pub(crate) fn put_sort_facet_values_by(
         &self,
         txn: &mut RwTxn,
-        val: &HashMap<String, OrderBy>,
+        val: &OrderByMap,
     ) -> heed::Result<()> {
         self.main.remap_types::<Str, SerdeJson<_>>().put(txn, main_key::SORT_FACET_VALUES_BY, &val)
     }
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index f6b398304..be79a7e86 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -16,6 +16,7 @@ pub mod facet;
 mod fields_ids_map;
 pub mod heed_codec;
 pub mod index;
+pub mod order_by_map;
 pub mod prompt;
 pub mod proximity;
 pub mod score_details;
diff --git a/milli/src/order_by_map.rs b/milli/src/order_by_map.rs
new file mode 100644
index 000000000..287e62c3a
--- /dev/null
+++ b/milli/src/order_by_map.rs
@@ -0,0 +1,57 @@
+use std::collections::{hash_map, HashMap};
+use std::iter::FromIterator;
+
+use serde::{Deserialize, Deserializer, Serialize};
+
+use crate::OrderBy;
+
+#[derive(Serialize)]
+pub struct OrderByMap(HashMap<String, OrderBy>);
+
+impl OrderByMap {
+    pub fn get(&self, key: impl AsRef<str>) -> OrderBy {
+        self.0
+            .get(key.as_ref())
+            .copied()
+            .unwrap_or_else(|| self.0.get("*").copied().unwrap_or_default())
+    }
+
+    pub fn insert(&mut self, key: String, value: OrderBy) -> Option<OrderBy> {
+        self.0.insert(key, value)
+    }
+}
+
+impl Default for OrderByMap {
+    fn default() -> Self {
+        let mut map = HashMap::new();
+        map.insert("*".to_string(), OrderBy::Lexicographic);
+        OrderByMap(map)
+    }
+}
+
+impl FromIterator<(String, OrderBy)> for OrderByMap {
+    fn from_iter<T: IntoIterator<Item = (String, OrderBy)>>(iter: T) -> Self {
+        OrderByMap(iter.into_iter().collect())
+    }
+}
+
+impl IntoIterator for OrderByMap {
+    type Item = (String, OrderBy);
+    type IntoIter = hash_map::IntoIter<String, OrderBy>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.into_iter()
+    }
+}
+
+impl<'de> Deserialize<'de> for OrderByMap {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let mut map = Deserialize::deserialize(deserializer).map(OrderByMap)?;
+        // Insert the default ordering if it is not already overwritten by the user.
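+        // (This guarantees every deserialized map contains the `*` wildcard entry,
+        //  the same fallback that `OrderByMap::default` provides and `get` relies on.)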
+        map.0.entry("*".to_string()).or_insert(OrderBy::default());
+        Ok(map)
+    }
+}
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 3cad79467..dcf41970e 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -14,12 +14,13 @@ use super::IndexerConfig;
 use crate::criterion::Criterion;
 use crate::error::UserError;
 use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
+use crate::order_by_map::OrderByMap;
 use crate::proximity::ProximityPrecision;
 use crate::update::index_documents::IndexDocumentsMethod;
 use crate::update::{IndexDocuments, UpdateIndexingStep};
 use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
 use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
-use crate::{FieldsIdsMap, Index, OrderBy, Result};
+use crate::{FieldsIdsMap, Index, Result};
 
 #[derive(Debug, Clone, PartialEq, Eq, Copy)]
 pub enum Setting<T> {
@@ -145,7 +146,7 @@ pub struct Settings<'a, 't, 'i> {
     /// Attributes on which typo tolerance is disabled.
     exact_attributes: Setting<HashSet<String>>,
     max_values_per_facet: Setting<usize>,
-    sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
+    sort_facet_values_by: Setting<OrderByMap>,
     pagination_max_total_hits: Setting<usize>,
     proximity_precision: Setting<ProximityPrecision>,
     embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
@@ -340,7 +341,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.max_values_per_facet = Setting::Reset;
     }
 
-    pub fn set_sort_facet_values_by(&mut self, value: HashMap<String, OrderBy>) {
+    pub fn set_sort_facet_values_by(&mut self, value: OrderByMap) {
         self.sort_facet_values_by = Setting::Set(value);
     }
 
From 88bc9556a93009d780f48a37fed88e2187b96ffc Mon Sep 17 00:00:00 2001
From: Jakob Klemm
Date: Tue, 12 Mar 2024 19:59:11 +0100
Subject: [PATCH 18/32] Add Ollama dimension inference and add clearer errors

Instead of the user manually specifying the model dimensions, the dimensions
are now determined automatically: just like with hf.rs, the word "test" is
embedded to determine the dimensionality of the output.

Also add a dedicated error type for when the model doesn't exist (without
automatically pulling it) and set the fault of that error to be the user.
---
 milli/src/update/settings.rs |   4 +-
 milli/src/vector/error.rs    |  15 +++++-
 milli/src/vector/ollama.rs   | 100 ++++++++++++++++++++++---------
 milli/src/vector/settings.rs |  13 ++---
 4 files changed, 96 insertions(+), 36 deletions(-)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index df273b023..ee2f58a01 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1179,8 +1179,8 @@ pub fn validate_embedding_settings(
             }
         }
         EmbedderSource::Ollama => {
-            // Existence & corrent dimensions of models cannot easily be checked here.
-            check_set(&dimensions, "dimensions", inferred_source, name)?;
+            // Dimensions get inferred, only model name is required
+            check_unset(&dimensions, "dimensions", inferred_source, name)?;
             check_set(&model, "model", inferred_source, name)?;
             check_unset(&api_key, "apiKey", inferred_source, name)?;
             check_unset(&revision, "revision", inferred_source, name)?;
diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs
index ffdda42ca..3f4d5eb51 100644
--- a/milli/src/vector/error.rs
+++ b/milli/src/vector/error.rs
@@ -79,6 +79,8 @@ pub enum EmbedErrorKind {
     OllamaTooManyRequests(OllamaError),
     #[error("received internal error from Ollama: {0}")]
     OllamaInternalServerError(OllamaError),
+    #[error("model not found.
MeiliSearch will not automatically download models from the Ollama library, please pull the model manually: {0}")] + OllamaModelNotFoundError(OllamaError), #[error("received unhandled HTTP status code {0} from Ollama")] OllamaUnhandledStatusCode(u16), } @@ -140,10 +142,14 @@ impl EmbedError { Self { kind: EmbedErrorKind::InitWebClient(inner), fault: FaultSource::Runtime } } - pub fn ollama_unexpected(inner: reqwest::Error) -> EmbedError { + pub(crate) fn ollama_unexpected(inner: reqwest::Error) -> EmbedError { Self { kind: EmbedErrorKind::OllamaUnexpected(inner), fault: FaultSource::Bug } } + pub(crate) fn ollama_model_not_found(inner: OllamaError) -> EmbedError { + Self { kind: EmbedErrorKind::OllamaModelNotFoundError(inner), fault: FaultSource::User } + } + pub(crate) fn ollama_too_many_requests(inner: OllamaError) -> EmbedError { Self { kind: EmbedErrorKind::OllamaTooManyRequests(inner), fault: FaultSource::Runtime } } @@ -221,6 +227,13 @@ impl NewEmbedderError { } } + pub fn ollama_could_not_determine_dimension(inner: EmbedError) -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::CouldNotDetermineDimension(inner), + fault: FaultSource::User, + } + } + pub fn openai_invalid_api_key_format(inner: reqwest::header::InvalidHeaderValue) -> Self { Self { kind: NewEmbedderErrorKind::InvalidApiKeyFormat(inner), fault: FaultSource::User } } diff --git a/milli/src/vector/ollama.rs b/milli/src/vector/ollama.rs index a83022dbd..76988f70b 100644 --- a/milli/src/vector/ollama.rs +++ b/milli/src/vector/ollama.rs @@ -18,7 +18,6 @@ pub struct Embedder { #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] pub struct EmbedderOptions { pub embedding_model: EmbeddingModel, - pub dimensions: usize, } #[derive( @@ -27,6 +26,7 @@ pub struct EmbedderOptions { #[deserr(deny_unknown_fields)] pub struct EmbeddingModel { name: String, + dimensions: usize, } #[derive(Debug, serde::Serialize)] @@ -40,16 +40,9 @@ struct OllamaResponse { embedding: Embedding, } -#[derive(Debug, serde::Deserialize)] -struct OllamaErrorResponse { - error: OllamaError, -} - #[derive(Debug, serde::Deserialize)] pub struct OllamaError { - message: String, - // type: String, - code: Option, + error: String, } impl EmbeddingModel { @@ -68,7 +61,7 @@ impl EmbeddingModel { } pub fn from_name(name: &str) -> Self { - Self { name: name.to_string() } + Self { name: name.to_string(), dimensions: 0 } } pub fn supports_overriding_dimensions(&self) -> bool { @@ -78,17 +71,17 @@ impl EmbeddingModel { impl Default for EmbeddingModel { fn default() -> Self { - Self { name: "nomic-embed-text".to_string() } + Self { name: "nomic-embed-text".to_string(), dimensions: 0 } } } impl EmbedderOptions { pub fn with_default_model() -> Self { - Self { embedding_model: Default::default(), dimensions: 768 } + Self { embedding_model: Default::default() } } - pub fn with_embedding_model(embedding_model: EmbeddingModel, dimensions: usize) -> Self { - Self { embedding_model, dimensions } + pub fn with_embedding_model(embedding_model: EmbeddingModel) -> Self { + Self { embedding_model } } } @@ -107,7 +100,58 @@ impl Embedder { reqwest::header::HeaderValue::from_static("application/json"), ); - Ok(Self { options, headers }) + let mut embedder = Self { options, headers }; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_io() + .enable_time() + .build() + .map_err(EmbedError::openai_runtime_init) + .map_err(NewEmbedderError::ollama_could_not_determine_dimension)?; + + // Get dimensions from Ollama + let request = + 
+            OllamaRequest { model: &embedder.options.embedding_model.name(), prompt: "test" };
+        // TODO: Refactor into shared error type
+        let client = embedder
+            .new_client()
+            .map_err(NewEmbedderError::ollama_could_not_determine_dimension)?;
+
+        rt.block_on(async move {
+            let response = client
+                .post(get_ollama_path())
+                .json(&request)
+                .send()
+                .await
+                .map_err(EmbedError::ollama_unexpected)
+                .map_err(NewEmbedderError::ollama_could_not_determine_dimension)?;
+
+            // Process error in case model not found
+            let response = Self::check_response(response).await.map_err(|_err| {
+                let e = EmbedError::ollama_model_not_found(OllamaError {
+                    error: format!("model: {}", embedder.options.embedding_model.name()),
+                });
+                NewEmbedderError::ollama_could_not_determine_dimension(e)
+            })?;
+
+            let response: OllamaResponse = response
+                .json()
+                .await
+                .map_err(EmbedError::ollama_unexpected)
+                .map_err(NewEmbedderError::ollama_could_not_determine_dimension)?;
+
+            let embedding = Embeddings::from_single_embedding(response.embedding);
+
+            embedder.options.embedding_model.dimensions = embedding.dimension();
+
+            tracing::info!(
+                "ollama model {} with dimensionality {} added",
+                embedder.options.embedding_model.name(),
+                embedding.dimension()
+            );
+
+            Ok(embedder)
+        })
     }
 
     async fn check_response(response: reqwest::Response) -> Result<reqwest::Response, Retry> {
@@ -115,26 +159,37 @@ impl Embedder {
         // Not the same number of possible error cases covered as with OpenAI.
         match response.status() {
             StatusCode::TOO_MANY_REQUESTS => {
-                let error_response: OllamaErrorResponse = response
+                let error_response: OllamaError = response
                     .json()
                     .await
                     .map_err(EmbedError::ollama_unexpected)
                     .map_err(Retry::retry_later)?;
 
                 return Err(Retry::rate_limited(EmbedError::ollama_too_many_requests(
-                    error_response.error,
+                    OllamaError { error: error_response.error },
                 )));
             }
             StatusCode::SERVICE_UNAVAILABLE => {
-                let error_response: OllamaErrorResponse = response
+                let error_response: OllamaError = response
                     .json()
                     .await
                     .map_err(EmbedError::ollama_unexpected)
                     .map_err(Retry::retry_later)?;
 
                 return Err(Retry::retry_later(EmbedError::ollama_internal_server_error(
-                    error_response.error,
+                    OllamaError { error: error_response.error },
                 )));
             }
+            StatusCode::NOT_FOUND => {
+                let error_response: OllamaError = response
+                    .json()
+                    .await
+                    .map_err(EmbedError::ollama_unexpected)
+                    .map_err(Retry::give_up)?;
+
+                return Err(Retry::give_up(EmbedError::ollama_model_not_found(OllamaError {
+                    error: error_response.error,
+                })));
+            }
             code => {
                 return Err(Retry::give_up(EmbedError::ollama_unhandled_status_code(
                     code.as_u16(),
@@ -232,7 +287,7 @@ impl Embedder {
     }
 
     pub fn dimensions(&self) -> usize {
-        self.options.dimensions
+        self.options.embedding_model.dimensions
     }
 
     pub fn distribution(&self) -> Option<DistributionShift> {
@@ -242,10 +297,7 @@ impl Embedder {
 
 impl Display for OllamaError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match &self.code {
-            Some(code) => write!(f, "{} ({})", self.message, code),
-            None => write!(f, "{}", self.message),
-        }
+        write!(f, "{}", self.error)
     }
 }
 
diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs
index 5595f60e3..84d58a996 100644
--- a/milli/src/vector/settings.rs
+++ b/milli/src/vector/settings.rs
@@ -85,9 +85,7 @@ impl EmbeddingSettings {
             }
             Self::REVISION => &[EmbedderSource::HuggingFace],
             Self::API_KEY => &[EmbedderSource::OpenAi],
-            Self::DIMENSIONS => {
-                &[EmbedderSource::OpenAi, EmbedderSource::UserProvided, EmbedderSource::Ollama]
-            }
+            Self::DIMENSIONS => &[EmbedderSource::OpenAi,
EmbedderSource::UserProvided], Self::DOCUMENT_TEMPLATE => { &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::Ollama] } @@ -107,9 +105,7 @@ impl EmbeddingSettings { EmbedderSource::HuggingFace => { &[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE] } - EmbedderSource::Ollama => { - &[Self::SOURCE, Self::MODEL, Self::DIMENSIONS, Self::DOCUMENT_TEMPLATE] - } + EmbedderSource::Ollama => &[Self::SOURCE, Self::MODEL, Self::DOCUMENT_TEMPLATE], EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS], } } @@ -211,7 +207,7 @@ impl From for EmbeddingSettings { model: Setting::Set(options.embedding_model.name().to_owned()), revision: Setting::NotSet, api_key: Setting::NotSet, - dimensions: Setting::Set(options.dimensions), + dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), }, super::EmbedderOptions::UserProvided(options) => Self { @@ -251,9 +247,8 @@ impl From for EmbeddingConfig { EmbedderSource::Ollama => { let mut options: ollama::EmbedderOptions = super::ollama::EmbedderOptions::with_default_model(); - if let (Some(model), Some(dim)) = (model.set(), dimensions.set()) { + if let Some(model) = model.set() { options.embedding_model = super::ollama::EmbeddingModel::from_name(&model); - options.dimensions = dim; } this.embedder_options = super::EmbedderOptions::Ollama(options); } From ae67d5eef097b1e722b4dc101bc8b696149d3096 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 13 Mar 2024 09:45:04 +0100 Subject: [PATCH 19/32] Update milli/src/vector/error.rs Fix Meilisearch capitalization --- milli/src/vector/error.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 3f4d5eb51..20bf49a6b 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -79,7 +79,7 @@ pub enum EmbedErrorKind { OllamaTooManyRequests(OllamaError), #[error("received internal error from Ollama: {0}")] OllamaInternalServerError(OllamaError), - #[error("model not found. MeiliSearch will not automatically download models from the Ollama library, please pull the model manually: {0}")] + #[error("model not found. 
Meilisearch will not automatically download models from the Ollama library, please pull the model manually: {0}")] OllamaModelNotFoundError(OllamaError), #[error("received unhandled HTTP status code {0} from Ollama")] OllamaUnhandledStatusCode(u16), From 9f7a4fbfeb4d5acaa6fef10e27753cb1530256e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 12 Mar 2024 18:04:38 +0100 Subject: [PATCH 20/32] Return the facets of a placeholder facet-search sorted by count --- meilisearch/src/search.rs | 3 +- milli/src/search/mod.rs | 80 +++++++++++++++++++++++++++++++-------- 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 8f3df04e0..d98e96e87 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -715,9 +715,8 @@ pub fn perform_facet_search( let rtxn = index.read_txn()?; let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features, None)?; - let sort_by = index.sort_facet_values_by(&rtxn)?.get(&facet_name); let mut facet_search = - SearchForFacetValues::new(facet_name, search, sort_by, search_query.hybrid.is_some()); + SearchForFacetValues::new(facet_name, search, search_query.hybrid.is_some()); if let Some(facet_query) = &facet_query { facet_search.query(facet_query); } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2b0dbe423..fbd76613e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,3 +1,5 @@ +use std::cmp::{Ordering, Reverse}; +use std::collections::BinaryHeap; use std::fmt; use std::ops::ControlFlow; @@ -307,7 +309,6 @@ pub struct SearchForFacetValues<'a> { facet: String, search_query: Search<'a>, max_values: usize, - sort_by: OrderBy, is_hybrid: bool, } @@ -315,7 +316,6 @@ impl<'a> SearchForFacetValues<'a> { pub fn new( facet: String, search_query: Search<'a>, - sort_by: OrderBy, is_hybrid: bool, ) -> SearchForFacetValues<'a> { SearchForFacetValues { @@ -323,7 +323,6 @@ impl<'a> SearchForFacetValues<'a> { facet, search_query, max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, - sort_by, is_hybrid, } } @@ -384,6 +383,8 @@ impl<'a> SearchForFacetValues<'a> { .search_query .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; + let sort_by = index.sort_facet_values_by(rtxn)?.get(&self.facet); + match self.query.as_ref() { Some(query) => { let options = NormalizerOption { lossy: true, ..Default::default() }; @@ -465,23 +466,56 @@ impl<'a> SearchForFacetValues<'a> { } } None => { - let mut results = vec![]; let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; - for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { - let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = - result?; - let count = search_candidates.intersection_len(&bitmap); - if count != 0 { - let value = self - .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? - .unwrap_or_else(|| left_bound.to_string()); - results.push(FacetValueHit { value, count }); + + match sort_by { + OrderBy::Lexicographic => { + let mut results = vec![]; + for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { + let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = + result?; + let count = search_candidates.intersection_len(&bitmap); + if count != 0 { + let value = self + .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? 
+                            .unwrap_or_else(|| left_bound.to_string());
+                        results.push(FacetValueHit { value, count });
+                    }
+                    if results.len() >= self.max_values {
+                        break;
+                    }
+                }
+                Ok(results)
+            }
+            OrderBy::Count => {
+                let mut top_counts = BinaryHeap::new();
+                for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
+                    let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
+                        result?;
+                    let count = search_candidates.intersection_len(&bitmap);
+                    if count != 0 {
+                        let value = self
+                            .one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
+                            .unwrap_or_else(|| left_bound.to_string());
+                        if top_counts.len() >= self.max_values {
+                            top_counts.pop();
+                        }
+                        // It is a max heap and we need to move the smallest counts at the
+                        // top to be able to pop them when we reach the max_values limit.
+                        top_counts.push(Reverse(FacetValueHit { value, count }));
+                    }
+                }
+
+                // Convert the heap into a vec of hits by removing the Reverse wrapper.
+                // Hits are already in the right order as they were reversed and there
+                // are output in ascending order.
+                Ok(top_counts
+                    .into_sorted_vec()
+                    .into_iter()
+                    .map(|Reverse(hit)| hit)
+                    .collect())
+            }
+        }
-                Ok(results)
             }
         }
     }
@@ -539,6 +573,20 @@ pub struct FacetValueHit {
     pub count: u64,
 }
 
+impl PartialOrd for FacetValueHit {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for FacetValueHit {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value))
+    }
+}
+
+impl Eq for FacetValueHit {}
+
 #[cfg(test)]
 mod test {
     #[allow(unused_imports)]

From 9f7a4fbfeb4d5acaa6fef10e27753cb1530256e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 13 Mar 2024 10:24:21 +0100
Subject: [PATCH 21/32] Move the SearchForFacetValues struct into a dedicated
 module
---
 milli/src/lib.rs                 |   6 +-
 milli/src/search/facet/mod.rs    |   3 +
 milli/src/search/facet/search.rs | 300 +++++++++++++++++++++
 milli/src/search/mod.rs          | 300 +--------------
 4 files changed, 308 insertions(+), 301 deletions(-)
 create mode 100644 milli/src/search/facet/search.rs

diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index be79a7e86..5effcea3d 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -57,10 +57,10 @@ pub use self::heed_codec::{
     UncheckedU8StrStrCodec,
 };
 pub use self::index::Index;
+pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
 pub use self::search::{
-    FacetDistribution, FacetValueHit, Filter, FormatOptions, MatchBounds, MatcherBuilder,
-    MatchingWords, OrderBy, Search, SearchForFacetValues, SearchResult, TermsMatchingStrategy,
-    DEFAULT_VALUES_PER_FACET,
+    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
+    Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 
 pub type Result<T> = std::result::Result<T, Error>;
diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs
index f676ee109..34a9cdcb8 100644
--- a/milli/src/search/facet/mod.rs
+++ b/milli/src/search/facet/mod.rs
@@ -6,15 +6,18 @@ use roaring::RoaringBitmap;
 
 pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
 pub use self::filter::{BadGeoError, Filter};
+pub use self::search::{FacetValueHit, SearchForFacetValues};
 use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
 use crate::heed_codec::BytesRefCodec;
 use crate::{Index, Result};
+
 mod facet_distribution;
 mod
facet_distribution_iter; mod facet_range_search; mod facet_sort_ascending; mod facet_sort_descending; mod filter; +mod search; fn facet_extreme_value<'t>( mut extreme_it: impl Iterator> + 't, diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs new file mode 100644 index 000000000..39bb74ace --- /dev/null +++ b/milli/src/search/facet/search.rs @@ -0,0 +1,300 @@ +use std::cmp::{Ordering, Reverse}; +use std::collections::BinaryHeap; +use std::ops::ControlFlow; + +use charabia::normalizer::NormalizerOption; +use charabia::Normalize; +use fst::automaton::{Automaton, Str}; +use fst::{IntoStreamer, Streamer}; +use roaring::RoaringBitmap; +use tracing::error; + +use crate::error::UserError; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; +use crate::search::build_dfa; +use crate::{DocumentId, FieldId, OrderBy, Result, Search}; + +/// The maximum number of values per facet returned by the facet search route. +const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100; + +pub struct SearchForFacetValues<'a> { + query: Option, + facet: String, + search_query: Search<'a>, + max_values: usize, + is_hybrid: bool, +} + +impl<'a> SearchForFacetValues<'a> { + pub fn new( + facet: String, + search_query: Search<'a>, + is_hybrid: bool, + ) -> SearchForFacetValues<'a> { + SearchForFacetValues { + query: None, + facet, + search_query, + max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, + is_hybrid, + } + } + + pub fn query(&mut self, query: impl Into) -> &mut Self { + self.query = Some(query.into()); + self + } + + pub fn max_values(&mut self, max: usize) -> &mut Self { + self.max_values = max; + self + } + + fn one_original_value_of( + &self, + field_id: FieldId, + facet_str: &str, + any_docid: DocumentId, + ) -> Result> { + let index = self.search_query.index; + let rtxn = self.search_query.rtxn; + let key: (FieldId, _, &str) = (field_id, any_docid, facet_str); + Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned())) + } + + pub fn execute(&self) -> Result> { + let index = self.search_query.index; + let rtxn = self.search_query.rtxn; + + let filterable_fields = index.filterable_fields(rtxn)?; + if !filterable_fields.contains(&self.facet) { + let (valid_fields, hidden_fields) = + index.remove_hidden_fields(rtxn, filterable_fields)?; + + return Err(UserError::InvalidFacetSearchFacetName { + field: self.facet.clone(), + valid_fields, + hidden_fields, + } + .into()); + } + + let fields_ids_map = index.fields_ids_map(rtxn)?; + let fid = match fields_ids_map.id(&self.facet) { + Some(fid) => fid, + // we return an empty list of results when the attribute has been + // set as filterable but no document contains this field (yet). + None => return Ok(Vec::new()), + }; + + let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? 
{ + Some(fst) => fst, + None => return Ok(vec![]), + }; + + let search_candidates = self + .search_query + .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; + + let sort_by = index.sort_facet_values_by(rtxn)?.get(&self.facet); + + match self.query.as_ref() { + Some(query) => { + let options = NormalizerOption { lossy: true, ..Default::default() }; + let query = query.normalize(&options); + let query = query.as_ref(); + + let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; + let field_authorizes_typos = + !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid); + + if authorize_typos && field_authorizes_typos { + let exact_words_fst = self.search_query.index.exact_words(rtxn)?; + if exact_words_fst.map_or(false, |fst| fst.contains(query)) { + let mut results = vec![]; + if fst.contains(query) { + self.fetch_original_facets_using_normalized( + fid, + query, + query, + &search_candidates, + &mut results, + )?; + } + Ok(results) + } else { + let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; + let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; + + let is_prefix = true; + let automaton = if query.len() < one_typo as usize { + build_dfa(query, 0, is_prefix) + } else if query.len() < two_typos as usize { + build_dfa(query, 1, is_prefix) + } else { + build_dfa(query, 2, is_prefix) + }; + + let mut stream = fst.search(automaton).into_stream(); + let mut results = vec![]; + while let Some(facet_value) = stream.next() { + let value = std::str::from_utf8(facet_value)?; + if self + .fetch_original_facets_using_normalized( + fid, + value, + query, + &search_candidates, + &mut results, + )? + .is_break() + { + break; + } + } + + Ok(results) + } + } else { + let automaton = Str::new(query).starts_with(); + let mut stream = fst.search(automaton).into_stream(); + let mut results = vec![]; + while let Some(facet_value) = stream.next() { + let value = std::str::from_utf8(facet_value)?; + if self + .fetch_original_facets_using_normalized( + fid, + value, + query, + &search_candidates, + &mut results, + )? + .is_break() + { + break; + } + } + + Ok(results) + } + } + None => { + let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; + match sort_by { + OrderBy::Lexicographic => { + let mut results = vec![]; + for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { + let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = + result?; + let count = search_candidates.intersection_len(&bitmap); + if count != 0 { + let value = self + .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? + .unwrap_or_else(|| left_bound.to_string()); + results.push(FacetValueHit { value, count }); + } + if results.len() >= self.max_values { + break; + } + } + Ok(results) + } + OrderBy::Count => { + let mut top_counts = BinaryHeap::new(); + for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { + let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = + result?; + let count = search_candidates.intersection_len(&bitmap); + if count != 0 { + let value = self + .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? + .unwrap_or_else(|| left_bound.to_string()); + if top_counts.len() >= self.max_values { + top_counts.pop(); + } + // It is a max heap and we need to move the smallest counts at the + // top to be able to pop them when we reach the max_values limit. 
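+                        // (BinaryHeap is a max-heap, so wrapping each hit in Reverse keeps the
+                        //  smallest count at the heap's root, where it can be evicted cheaply
+                        //  once the collection already holds max_values entries.)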
+ top_counts.push(Reverse(FacetValueHit { value, count })); + } + } + + // Convert the heap into a vec of hits by removing the Reverse wrapper. + // Hits are already in the right order as they were reversed and there + // are output in ascending order. + Ok(top_counts + .into_sorted_vec() + .into_iter() + .map(|Reverse(hit)| hit) + .collect()) + } + } + } + } + } + + fn fetch_original_facets_using_normalized( + &self, + fid: FieldId, + value: &str, + query: &str, + search_candidates: &RoaringBitmap, + results: &mut Vec, + ) -> Result> { + let index = self.search_query.index; + let rtxn = self.search_query.rtxn; + + let database = index.facet_id_normalized_string_strings; + let key = (fid, value); + let original_strings = match database.get(rtxn, &key)? { + Some(original_strings) => original_strings, + None => { + error!("the facet value is missing from the facet database: {key:?}"); + return Ok(ControlFlow::Continue(())); + } + }; + for original in original_strings { + let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() }; + let docids = match index.facet_id_string_docids.get(rtxn, &key)? { + Some(FacetGroupValue { bitmap, .. }) => bitmap, + None => { + error!("the facet value is missing from the facet database: {key:?}"); + return Ok(ControlFlow::Continue(())); + } + }; + let count = search_candidates.intersection_len(&docids); + if count != 0 { + let value = self + .one_original_value_of(fid, &original, docids.min().unwrap())? + .unwrap_or_else(|| query.to_string()); + results.push(FacetValueHit { value, count }); + } + if results.len() >= self.max_values { + return Ok(ControlFlow::Break(())); + } + } + + Ok(ControlFlow::Continue(())) + } +} + +#[derive(Debug, Clone, serde::Serialize, PartialEq)] +pub struct FacetValueHit { + /// The original facet value + pub value: String, + /// The number of documents associated to this facet + pub count: u64, +} + +impl PartialOrd for FacetValueHit { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for FacetValueHit { + fn cmp(&self, other: &Self) -> Ordering { + self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value)) + } +} + +impl Eq for FacetValueHit {} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index fbd76613e..dc8354486 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,27 +1,17 @@ -use std::cmp::{Ordering, Reverse}; -use std::collections::BinaryHeap; use std::fmt; -use std::ops::ControlFlow; -use charabia::normalizer::NormalizerOption; -use charabia::Normalize; -use fst::automaton::{Automaton, Str}; -use fst::{IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use tracing::error; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult}; -use crate::error::UserError; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::DistributionShift; use crate::{ - execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, - Result, SearchContext, + execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, + SearchContext, }; // Building these factories is not free. 
@@ -29,9 +19,6 @@ static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); -/// The maximum number of values per facet returned by the facet search route. -const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100; - pub mod facet; mod fst_utils; pub mod hybrid; @@ -304,289 +291,6 @@ pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { } } -pub struct SearchForFacetValues<'a> { - query: Option, - facet: String, - search_query: Search<'a>, - max_values: usize, - is_hybrid: bool, -} - -impl<'a> SearchForFacetValues<'a> { - pub fn new( - facet: String, - search_query: Search<'a>, - is_hybrid: bool, - ) -> SearchForFacetValues<'a> { - SearchForFacetValues { - query: None, - facet, - search_query, - max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, - is_hybrid, - } - } - - pub fn query(&mut self, query: impl Into) -> &mut Self { - self.query = Some(query.into()); - self - } - - pub fn max_values(&mut self, max: usize) -> &mut Self { - self.max_values = max; - self - } - - fn one_original_value_of( - &self, - field_id: FieldId, - facet_str: &str, - any_docid: DocumentId, - ) -> Result> { - let index = self.search_query.index; - let rtxn = self.search_query.rtxn; - let key: (FieldId, _, &str) = (field_id, any_docid, facet_str); - Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned())) - } - - pub fn execute(&self) -> Result> { - let index = self.search_query.index; - let rtxn = self.search_query.rtxn; - - let filterable_fields = index.filterable_fields(rtxn)?; - if !filterable_fields.contains(&self.facet) { - let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, filterable_fields)?; - - return Err(UserError::InvalidFacetSearchFacetName { - field: self.facet.clone(), - valid_fields, - hidden_fields, - } - .into()); - } - - let fields_ids_map = index.fields_ids_map(rtxn)?; - let fid = match fields_ids_map.id(&self.facet) { - Some(fid) => fid, - // we return an empty list of results when the attribute has been - // set as filterable but no document contains this field (yet). - None => return Ok(Vec::new()), - }; - - let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? 
{ - Some(fst) => fst, - None => return Ok(vec![]), - }; - - let search_candidates = self - .search_query - .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; - - let sort_by = index.sort_facet_values_by(rtxn)?.get(&self.facet); - - match self.query.as_ref() { - Some(query) => { - let options = NormalizerOption { lossy: true, ..Default::default() }; - let query = query.normalize(&options); - let query = query.as_ref(); - - let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; - let field_authorizes_typos = - !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid); - - if authorize_typos && field_authorizes_typos { - let exact_words_fst = self.search_query.index.exact_words(rtxn)?; - if exact_words_fst.map_or(false, |fst| fst.contains(query)) { - let mut results = vec![]; - if fst.contains(query) { - self.fetch_original_facets_using_normalized( - fid, - query, - query, - &search_candidates, - &mut results, - )?; - } - Ok(results) - } else { - let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; - let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; - - let is_prefix = true; - let automaton = if query.len() < one_typo as usize { - build_dfa(query, 0, is_prefix) - } else if query.len() < two_typos as usize { - build_dfa(query, 1, is_prefix) - } else { - build_dfa(query, 2, is_prefix) - }; - - let mut stream = fst.search(automaton).into_stream(); - let mut results = vec![]; - while let Some(facet_value) = stream.next() { - let value = std::str::from_utf8(facet_value)?; - if self - .fetch_original_facets_using_normalized( - fid, - value, - query, - &search_candidates, - &mut results, - )? - .is_break() - { - break; - } - } - - Ok(results) - } - } else { - let automaton = Str::new(query).starts_with(); - let mut stream = fst.search(automaton).into_stream(); - let mut results = vec![]; - while let Some(facet_value) = stream.next() { - let value = std::str::from_utf8(facet_value)?; - if self - .fetch_original_facets_using_normalized( - fid, - value, - query, - &search_candidates, - &mut results, - )? - .is_break() - { - break; - } - } - - Ok(results) - } - } - None => { - let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; - - match sort_by { - OrderBy::Lexicographic => { - let mut results = vec![]; - for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { - let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = - result?; - let count = search_candidates.intersection_len(&bitmap); - if count != 0 { - let value = self - .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? - .unwrap_or_else(|| left_bound.to_string()); - results.push(FacetValueHit { value, count }); - } - if results.len() >= self.max_values { - break; - } - } - Ok(results) - } - OrderBy::Count => { - let mut top_counts = BinaryHeap::new(); - for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { - let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = - result?; - let count = search_candidates.intersection_len(&bitmap); - if count != 0 { - let value = self - .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? - .unwrap_or_else(|| left_bound.to_string()); - if top_counts.len() >= self.max_values { - top_counts.pop(); - } - // It is a max heap and we need to move the smallest counts at the - // top to be able to pop them when we reach the max_values limit. 
-                            top_counts.push(Reverse(FacetValueHit { value, count }));
-                        }
-                    }
-
-                    // Convert the heap into a vec of hits by removing the Reverse wrapper.
-                    // Hits are already in the right order as they were reversed and there
-                    // are output in ascending order.
-                    Ok(top_counts
-                        .into_sorted_vec()
-                        .into_iter()
-                        .map(|Reverse(hit)| hit)
-                        .collect())
-                }
-            }
-        }
-    }
-}
-
-    fn fetch_original_facets_using_normalized(
-        &self,
-        fid: FieldId,
-        value: &str,
-        query: &str,
-        search_candidates: &RoaringBitmap,
-        results: &mut Vec<FacetValueHit>,
-    ) -> Result<ControlFlow<()>> {
-        let index = self.search_query.index;
-        let rtxn = self.search_query.rtxn;
-
-        let database = index.facet_id_normalized_string_strings;
-        let key = (fid, value);
-        let original_strings = match database.get(rtxn, &key)? {
-            Some(original_strings) => original_strings,
-            None => {
-                error!("the facet value is missing from the facet database: {key:?}");
-                return Ok(ControlFlow::Continue(()));
-            }
-        };
-        for original in original_strings {
-            let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
-            let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
-                Some(FacetGroupValue { bitmap, .. }) => bitmap,
-                None => {
-                    error!("the facet value is missing from the facet database: {key:?}");
-                    return Ok(ControlFlow::Continue(()));
-                }
-            };
-            let count = search_candidates.intersection_len(&docids);
-            if count != 0 {
-                let value = self
-                    .one_original_value_of(fid, &original, docids.min().unwrap())?
-                    .unwrap_or_else(|| query.to_string());
-                results.push(FacetValueHit { value, count });
-            }
-            if results.len() >= self.max_values {
-                return Ok(ControlFlow::Break(()));
-            }
-        }
-
-        Ok(ControlFlow::Continue(()))
-    }
-}
-
-#[derive(Debug, Clone, serde::Serialize, PartialEq)]
-pub struct FacetValueHit {
-    /// The original facet value
-    pub value: String,
-    /// The number of documents associated to this facet
-    pub count: u64,
-}
-
-impl PartialOrd for FacetValueHit {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Ord for FacetValueHit {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value))
-    }
-}
-
-impl Eq for FacetValueHit {}
-
 #[cfg(test)]
 mod test {
     #[allow(unused_imports)]

From b918b55c6b7e7856eee3e7359d70bcf4124cce4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 13 Mar 2024 11:06:13 +0100
Subject: [PATCH 22/32] Introduce a new facet value collection wrapper to
 simplify the usage
---
 milli/src/search/facet/search.rs | 67 ++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs
index 39bb74ace..504221837 100644
--- a/milli/src/search/facet/search.rs
+++ b/milli/src/search/facet/search.rs
@@ -6,6 +6,8 @@ use charabia::normalizer::NormalizerOption;
 use charabia::Normalize;
 use fst::automaton::{Automaton, Str};
 use fst::{IntoStreamer, Streamer};
+use futures::stream::PeekMut;
+use itertools::concat;
 use roaring::RoaringBitmap;
 use tracing::error;
 
@@ -298,3 +300,68 @@ impl Ord for FacetValueHit {
 }
 
 impl Eq for FacetValueHit {}
+
+/// A wrapper type that collects the best facet values by
+/// lexicographic or number of associated values.
+enum ValuesCollection {
+    /// Keeps the top values according to the lexicographic order.
+    Lexicographic { max: usize, content: Vec<FacetValueHit> },
+    /// Keeps the top values according to the number of values associated to them.
+    ///
+    /// Note that it is a max heap and we need to move the smallest counts
+    /// at the top to be able to pop them when we reach the max_values limit.
+    Count { max: usize, content: BinaryHeap<Reverse<FacetValueHit>> },
+}
+
+impl ValuesCollection {
+    pub fn new_lexicographic(max: usize) -> Self {
+        ValuesCollection::Lexicographic { max, content: Vec::with_capacity(max) }
+    }
+
+    pub fn new_count(max: usize) -> Self {
+        ValuesCollection::Count { max, content: BinaryHeap::with_capacity(max) }
+    }
+
+    pub fn insert(&mut self, value: FacetValueHit) -> ControlFlow<()> {
+        match self {
+            ValuesCollection::Lexicographic { max, content } => {
+                if content.len() < *max {
+                    content.push(value);
+                    if content.len() < *max {
+                        return ControlFlow::Continue(());
+                    }
+                }
+                ControlFlow::Break(())
+            }
+            ValuesCollection::Count { max, content } => {
+                if content.len() == *max {
+                    // Peeking gives us the worst value in the list as
+                    // this is a max-heap and we reversed it.
+                    let Some(mut peek) = content.peek_mut() else { return ControlFlow::Break(()) };
+                    if peek.0.count <= value.count {
+                        // Replace the current worst value in the heap
+                        // with the new one we received that is better.
+                        *peek = Reverse(value);
+                    }
+                } else {
+                    content.push(Reverse(value));
+                }
+                ControlFlow::Continue(())
+            }
+        }
+    }
+
+    /// Returns the list of facet values in descending order of, either,
+    /// count or lexicographic order of the value depending on the type.
+    pub fn into_sorted_vec(self) -> Vec<FacetValueHit> {
+        match self {
+            ValuesCollection::Lexicographic { content, .. } => content.into_iter().collect(),
+            ValuesCollection::Count { content, .. } => {
+                // Convert the heap into a vec of hits by removing the Reverse wrapper.
+                // Hits are already in the right order as they were reversed and there
+                // are output in ascending order.
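+                // (BinaryHeap::into_sorted_vec returns elements in ascending order; ascending
+                //  order of Reverse(hit) is exactly descending count order for the hits.)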
+ content.into_sorted_vec().into_iter().map(|Reverse(hit)| hit).collect() + } + } + } +} From e0dac5a22f5af7bda383ee144981d2567689280a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 13 Mar 2024 11:13:46 +0100 Subject: [PATCH 23/32] Simplify the algorithm by using the new facet values collection wrapper --- milli/src/search/facet/search.rs | 85 +++++++++----------------------- 1 file changed, 22 insertions(+), 63 deletions(-) diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs index 504221837..a917bcf7c 100644 --- a/milli/src/search/facet/search.rs +++ b/milli/src/search/facet/search.rs @@ -6,8 +6,6 @@ use charabia::normalizer::NormalizerOption; use charabia::Normalize; use fst::automaton::{Automaton, Str}; use fst::{IntoStreamer, Streamer}; -use futures::stream::PeekMut; -use itertools::concat; use roaring::RoaringBitmap; use tracing::error; @@ -98,7 +96,10 @@ impl<'a> SearchForFacetValues<'a> { .search_query .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; - let sort_by = index.sort_facet_values_by(rtxn)?.get(&self.facet); + let mut results = match index.sort_facet_values_by(rtxn)?.get(&self.facet) { + OrderBy::Lexicographic => ValuesCollection::by_lexicographic(self.max_values), + OrderBy::Count => ValuesCollection::by_count(self.max_values), + }; match self.query.as_ref() { Some(query) => { @@ -113,7 +114,6 @@ impl<'a> SearchForFacetValues<'a> { if authorize_typos && field_authorizes_typos { let exact_words_fst = self.search_query.index.exact_words(rtxn)?; if exact_words_fst.map_or(false, |fst| fst.contains(query)) { - let mut results = vec![]; if fst.contains(query) { self.fetch_original_facets_using_normalized( fid, @@ -123,7 +123,6 @@ impl<'a> SearchForFacetValues<'a> { &mut results, )?; } - Ok(results) } else { let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; @@ -138,7 +137,6 @@ impl<'a> SearchForFacetValues<'a> { }; let mut stream = fst.search(automaton).into_stream(); - let mut results = vec![]; while let Some(facet_value) = stream.next() { let value = std::str::from_utf8(facet_value)?; if self @@ -154,13 +152,10 @@ impl<'a> SearchForFacetValues<'a> { break; } } - - Ok(results) } } else { let automaton = Str::new(query).starts_with(); let mut stream = fst.search(automaton).into_stream(); - let mut results = vec![]; while let Some(facet_value) = stream.next() { let value = std::str::from_utf8(facet_value)?; if self @@ -176,62 +171,27 @@ impl<'a> SearchForFacetValues<'a> { break; } } - - Ok(results) } } None => { let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; - match sort_by { - OrderBy::Lexicographic => { - let mut results = vec![]; - for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { - let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = - result?; - let count = search_candidates.intersection_len(&bitmap); - if count != 0 { - let value = self - .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? - .unwrap_or_else(|| left_bound.to_string()); - results.push(FacetValueHit { value, count }); - } - if results.len() >= self.max_values { - break; - } + for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { + let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. 
}) =
+                        result?;
+                    let count = search_candidates.intersection_len(&bitmap);
+                    if count != 0 {
+                        let value = self
+                            .one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
+                            .unwrap_or_else(|| left_bound.to_string());
+                        if results.insert(FacetValueHit { value, count }).is_break() {
+                            break;
+                        }
                     }
                 }
             }
         }
+
+        Ok(results.into_sorted_vec())
     }
 
     fn fetch_original_facets_using_normalized(
@@ -240,7 +200,7 @@ impl<'a> SearchForFacetValues<'a> {
         value: &str,
         query: &str,
         search_candidates: &RoaringBitmap,
-        results: &mut Vec<FacetValueHit>,
+        results: &mut ValuesCollection,
     ) -> Result<ControlFlow<()>> {
         let index = self.search_query.index;
         let rtxn = self.search_query.rtxn;
@@ -268,10 +228,9 @@ impl<'a> SearchForFacetValues<'a> {
             let value = self
                 .one_original_value_of(fid, &original, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string()); - results.push(FacetValueHit { value, count }); - } - if results.len() >= self.max_values { - return Ok(ControlFlow::Break(())); + if results.insert(FacetValueHit { value, count }).is_break() { + break; + } } } @@ -314,11 +273,11 @@ enum ValuesCollection { } impl ValuesCollection { - pub fn new_lexicographic(max: usize) -> Self { + pub fn by_lexicographic(max: usize) -> Self { ValuesCollection::Lexicographic { max, content: Vec::with_capacity(max) } } - pub fn new_count(max: usize) -> Self { + pub fn by_count(max: usize) -> Self { ValuesCollection::Count { max, content: BinaryHeap::with_capacity(max) } } From 6c9823d7bba6aaac736a26586c775ce0e3a0d4e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 13 Mar 2024 11:57:55 +0100 Subject: [PATCH 24/32] Add tests to sortFacetValuesBy count --- meilisearch/tests/search/facet_search.rs | 43 ++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/meilisearch/tests/search/facet_search.rs b/meilisearch/tests/search/facet_search.rs index 5f9f631f9..12d2226a9 100644 --- a/meilisearch/tests/search/facet_search.rs +++ b/meilisearch/tests/search/facet_search.rs @@ -123,6 +123,28 @@ async fn simple_facet_search_with_max_values() { assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1); } +#[actix_rt::test] +async fn simple_facet_search_by_count_with_max_values() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index + .update_settings_faceting( + json!({ "maxValuesPerFacet": 1, "sortFacetValuesBy": { "*": "count" } }), + ) + .await; + index.update_settings_filterable_attributes(json!(["genres"])).await; + index.add_documents(documents, None).await; + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn non_filterable_facet_search_error() { let server = Server::new().await; @@ -157,3 +179,24 @@ async fn facet_search_dont_support_words() { assert_eq!(code, 200, "{}", response); assert_eq!(response["facetHits"].as_array().unwrap().len(), 0); } + +#[actix_rt::test] +async fn simple_facet_search_with_sort_by_count() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.update_settings_faceting(json!({ "sortFacetValuesBy": { "*": "count" } })).await; + index.update_settings_filterable_attributes(json!(["genres"])).await; + index.add_documents(documents, None).await; + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + let hits = response["facetHits"].as_array().unwrap(); + assert_eq!(hits.len(), 2); + assert_eq!(hits[0], json!({ "value": "Action", "count": 3 })); + assert_eq!(hits[1], json!({ "value": "Adventure", "count": 2 })); +} From 6fa387226815d768281dd1e25d9590578f5b547e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 13 Mar 2024 13:45:02 +0100 Subject: [PATCH 25/32] Workflows: Fix reason param when benches are triggered from a comment. 
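
`github.event.issue.id` is GitHub's internal database identifier and
`github.event.comment.url` points at the REST API resource, so the generated
reason linked to the API and displayed the database id instead of the PR
number. `comment.html_url` and `issue.number`/`issue.html_url` are the
browser-facing equivalents, producing a reason such as
"[Comment](https://github.com/org/repo/pull/123#issuecomment-1) on
[#123](https://github.com/org/repo/pull/123)" (URLs here are illustrative).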
--- .github/workflows/bench-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index 6f4956542..418a23717 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -43,4 +43,4 @@ jobs: - name: Run benchmarks on PR ${{ github.event.issue.id }} run: | - cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.url }}) on [#${{github.event.issue.id}}](${{ github.event.issue.url }})" -- ${{ steps.command.outputs.command-arguments }} \ No newline at end of file + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" -- ${{ steps.command.outputs.command-arguments }} \ No newline at end of file From f3fc2bd01fadfab8fcfc5c8b7d0802e235fa2781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 13 Mar 2024 15:22:14 +0100 Subject: [PATCH 26/32] Address some issues with preallocations --- milli/src/search/facet/search.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs index a917bcf7c..0251d6b8d 100644 --- a/milli/src/search/facet/search.rs +++ b/milli/src/search/facet/search.rs @@ -89,7 +89,7 @@ impl<'a> SearchForFacetValues<'a> { let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? { Some(fst) => fst, - None => return Ok(vec![]), + None => return Ok(Vec::new()), }; let search_candidates = self @@ -274,11 +274,11 @@ enum ValuesCollection { impl ValuesCollection { pub fn by_lexicographic(max: usize) -> Self { - ValuesCollection::Lexicographic { max, content: Vec::with_capacity(max) } + ValuesCollection::Lexicographic { max, content: Vec::new() } } pub fn by_count(max: usize) -> Self { - ValuesCollection::Count { max, content: BinaryHeap::with_capacity(max) } + ValuesCollection::Count { max, content: BinaryHeap::new() } } pub fn insert(&mut self, value: FacetValueHit) -> ControlFlow<()> { From 5c95b5c933860ad8f4cd880b5544f21f7fa5ac4b Mon Sep 17 00:00:00 2001 From: shuangcui Date: Thu, 14 Mar 2024 21:28:55 +0800 Subject: [PATCH 27/32] chore: remove repetitive words Signed-off-by: shuangcui --- meilisearch/src/search.rs | 2 +- milli/src/search/facet/facet_range_search.rs | 4 ++-- milli/src/search/new/tests/typo_proximity.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index d98e96e87..e65192d16 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -530,7 +530,7 @@ pub fn perform_search( // The attributes to retrieve are the ones explicitly marked as to retrieve (all by default), // but these attributes must be also be present // - in the fields_ids_map - // - in the the displayed attributes + // - in the displayed attributes let to_retrieve_ids: BTreeSet<_> = query .attributes_to_retrieve .as_ref() diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index f1a26ded5..e340fbac5 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -168,7 +168,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } // should we stop? 
- // We should if the the search range doesn't include any + // We should if the search range doesn't include any // element from the previous key or its successors let should_stop = { match self.right { @@ -232,7 +232,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } // should we stop? - // We should if the the search range doesn't include any + // We should if the search range doesn't include any // element from the previous key or its successors let should_stop = { match self.right { diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs index 8dd110704..e71d32331 100644 --- a/milli/src/search/new/tests/typo_proximity.rs +++ b/milli/src/search/new/tests/typo_proximity.rs @@ -5,7 +5,7 @@ The typo ranking rule should transform the query graph such that it only contain the combinations of word derivations that it used to compute its bucket. The proximity ranking rule should then look for proximities only between those specific derivations. -For example, given the the search query `beautiful summer` and the dataset: +For example, given the search query `beautiful summer` and the dataset: ```text { "id": 0, "text": "beautigul summer...... beautiful day in the summer" } { "id": 1, "text": "beautiful summer" } From 13cc62728ba346746ace32157731e040fec56326 Mon Sep 17 00:00:00 2001 From: Mohsen Alizadeh Date: Sun, 17 Mar 2024 19:29:42 -0700 Subject: [PATCH 28/32] Fix milli link in contributing doc --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 073da7031..8bf5958c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ First, thank you for contributing to Meilisearch! The goal of this document is t Remember that there are many ways to contribute other than writing code: writing [tutorials or blog posts](https://github.com/meilisearch/awesome-meilisearch), improving [the documentation](https://github.com/meilisearch/documentation), submitting [bug reports](https://github.com/meilisearch/meilisearch/issues/new?assignees=&labels=&template=bug_report.md&title=) and [feature requests](https://github.com/meilisearch/product/discussions/categories/feedback-feature-proposal)... -The code in this repository is only concerned with managing multiple indexes, handling the update store, and exposing an HTTP API. Search and indexation are the domain of our core engine, [`milli`](https://github.com/meilisearch/milli), while tokenization is handled by [our `charabia` library](https://github.com/meilisearch/charabia/). +The code in this repository is only concerned with managing multiple indexes, handling the update store, and exposing an HTTP API. Search and indexation are the domain of our core engine, [`milli`](https://github.com/meilisearch/meilisearch/tree/main/milli), while tokenization is handled by [our `charabia` library](https://github.com/meilisearch/charabia/). If Meilisearch does not offer optimized support for your language, please consider contributing to `charabia` by following the [CONTRIBUTING.md file](https://github.com/meilisearch/charabia/blob/main/CONTRIBUTING.md) and integrating your intended normalizer/segmenter. 
From f4037c1a95bb2b30b164597af43f2ca5b0a4de44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20U=2E=20-=20curqui?= Date: Mon, 18 Mar 2024 15:39:01 +0100 Subject: [PATCH 29/32] Update CONTRIBUTING.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8bf5958c3..24034aba6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ First, thank you for contributing to Meilisearch! The goal of this document is t Remember that there are many ways to contribute other than writing code: writing [tutorials or blog posts](https://github.com/meilisearch/awesome-meilisearch), improving [the documentation](https://github.com/meilisearch/documentation), submitting [bug reports](https://github.com/meilisearch/meilisearch/issues/new?assignees=&labels=&template=bug_report.md&title=) and [feature requests](https://github.com/meilisearch/product/discussions/categories/feedback-feature-proposal)... -The code in this repository is only concerned with managing multiple indexes, handling the update store, and exposing an HTTP API. Search and indexation are the domain of our core engine, [`milli`](https://github.com/meilisearch/meilisearch/tree/main/milli), while tokenization is handled by [our `charabia` library](https://github.com/meilisearch/charabia/). +Meilisearch can manage multiple indexes, handle the update store, and expose an HTTP API. Search and indexation are the domain of our core engine, [`milli`](https://github.com/meilisearch/meilisearch/tree/main/milli), while tokenization is handled by [our `charabia` library](https://github.com/meilisearch/charabia/). If Meilisearch does not offer optimized support for your language, please consider contributing to `charabia` by following the [CONTRIBUTING.md file](https://github.com/meilisearch/charabia/blob/main/CONTRIBUTING.md) and integrating your intended normalizer/segmenter.
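The next patch (30/32) corrects how unset embedder dimensions surface in the settings output. The bug pattern is worth spelling out: applied to the `Option<usize>`, `unwrap_or_default()` collapses a missing value to `0` before it is wrapped in `Set`, whereas mapping into the setting first lets a missing value fall through to the `NotSet` default. A minimal sketch, assuming a toy `Setting` enum whose `Default` is `NotSet` (milli's real type is richer):

```rust
// Toy stand-in for milli's `Setting`; only the `Default = NotSet`
// behaviour that the fix relies on is modelled here.
#[derive(Debug, Default, PartialEq)]
enum Setting<T> {
    Set(T),
    #[default]
    NotSet,
}

fn main() {
    let dimensions: Option<usize> = None; // the user never set dimensions

    // Before: the Option is defaulted first, so `None` becomes `0`
    // and the settings display a spurious `dimensions: 0`.
    let before = Setting::Set(dimensions.unwrap_or_default());
    assert_eq!(before, Setting::Set(0));

    // After: map into `Setting` first, then default the `Setting`,
    // so a missing value stays `NotSet` and is simply not displayed.
    let after = dimensions.map(Setting::Set).unwrap_or_default();
    assert_eq!(after, Setting::NotSet);
}
```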
From a302e258bd8689fce07035327820af536542685c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Mar 2024 16:10:12 +0100 Subject: [PATCH 30/32] Don't display dimensions as 0 when it is not set --- milli/src/vector/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 84d58a996..89571e98a 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -199,7 +199,7 @@ impl From for EmbeddingSettings { model: Setting::Set(options.embedding_model.name().to_owned()), revision: Setting::NotSet, api_key: options.api_key.map(Setting::Set).unwrap_or_default(), - dimensions: Setting::Set(options.dimensions.unwrap_or_default()), + dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(), document_template: Setting::Set(prompt.template), }, super::EmbedderOptions::Ollama(options) => Self { From 29e71eedc7cf28f2b486786f99090f93a767f3a0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Mar 2024 18:31:28 +0100 Subject: [PATCH 31/32] Add benchmarks --- .../settings/settings-add-remove-filters.json | 94 ++++++++++++++ .../settings-proximity-precision.json | 86 +++++++++++++ .../settings-remove-add-swap-searchable.json | 114 +++++++++++++++++ workloads/settings/settings-typo.json | 115 ++++++++++++++++++ 4 files changed, 409 insertions(+) create mode 100644 workloads/settings/settings-add-remove-filters.json create mode 100644 workloads/settings/settings-proximity-precision.json create mode 100644 workloads/settings/settings-remove-add-swap-searchable.json create mode 100644 workloads/settings/settings-typo.json diff --git a/workloads/settings/settings-add-remove-filters.json b/workloads/settings/settings-add-remove-filters.json new file mode 100644 index 000000000..04a57c707 --- /dev/null +++ b/workloads/settings/settings-add-remove-filters.json @@ -0,0 +1,94 @@ +{ + "name": "settings-add-remove-filters.json", + "run_count": 2, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "150k-people.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/150k-people.json", + "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" + } + }, + "commands": [ + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "last_name", + "first_name", + "featured_job_organization_name", + "facebook_url", + "twitter_url", + "linkedin_url" + ], + "filterableAttributes": [ + "city", + "region", + "country_code" + ], + "dictionary": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ], + "stopWords": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/peoples/documents", + "method": "POST", + "body": { + "asset": "150k-people.json" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "filterableAttributes": [ + "city", + "region", + "country_code", + "featured_job_title", + "featured_job_organization_name" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "filterableAttributes": [ + "city", + "region", + "country_code" + ] + } + }, + "synchronous": "WaitForTask" + } + ] +} \ 
No newline at end of file diff --git a/workloads/settings/settings-proximity-precision.json b/workloads/settings/settings-proximity-precision.json new file mode 100644 index 000000000..48cfad49d --- /dev/null +++ b/workloads/settings/settings-proximity-precision.json @@ -0,0 +1,86 @@ +{ + "name": "settings-proximity-precision.json", + "run_count": 2, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "150k-people.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/150k-people.json", + "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" + } + }, + "commands": [ + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "last_name", + "first_name", + "featured_job_organization_name", + "facebook_url", + "twitter_url", + "linkedin_url" + ], + "filterableAttributes": [ + "city", + "region", + "country_code", + "featured_job_title", + "featured_job_organization_name" + ], + "dictionary": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ], + "stopWords": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/peoples/documents", + "method": "POST", + "body": { + "asset": "150k-people.json" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "proximityPrecision": "byAttribute" + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "proximityPrecision": "byWord" + } + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/settings/settings-remove-add-swap-searchable.json b/workloads/settings/settings-remove-add-swap-searchable.json new file mode 100644 index 000000000..ba315680f --- /dev/null +++ b/workloads/settings/settings-remove-add-swap-searchable.json @@ -0,0 +1,114 @@ +{ + "name": "settings-remove-add-swap-searchable.json", + "run_count": 2, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "150k-people.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/150k-people.json", + "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" + } + }, + "commands": [ + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "last_name", + "first_name", + "featured_job_organization_name", + "facebook_url", + "twitter_url", + "linkedin_url" + ], + "filterableAttributes": [ + "city", + "region", + "country_code", + "featured_job_title", + "featured_job_organization_name" + ], + "dictionary": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ], + "stopWords": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/peoples/documents", + "method": "POST", + "body": { + "asset": "150k-people.json" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "last_name", + "first_name", + 
"featured_job_organization_name" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "last_name", + "first_name", + "featured_job_organization_name", + "facebook_url", + "twitter_url", + "linkedin_url" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "first_name", + "last_name", + "featured_job_organization_name", + "facebook_url", + "twitter_url", + "linkedin_url" + ] + } + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/settings/settings-typo.json b/workloads/settings/settings-typo.json new file mode 100644 index 000000000..a272e6d1f --- /dev/null +++ b/workloads/settings/settings-typo.json @@ -0,0 +1,115 @@ +{ + "name": "settings-typo.json", + "run_count": 2, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "150k-people.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/150k-people.json", + "sha256": "28c359a0956958af0ba204ec11bad3045a0864a10b4838914fea25a01724f84b" + } + }, + "commands": [ + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "last_name", + "first_name", + "featured_job_title", + "featured_job_organization_name", + "facebook_url", + "twitter_url", + "linkedin_url" + ], + "filterableAttributes": [ + "city", + "region", + "country_code", + "featured_job_title", + "featured_job_organization_name" + ], + "dictionary": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ], + "stopWords": [ + "https://", + "http://", + "www.", + "crunchbase.com", + "facebook.com", + "twitter.com", + "linkedin.com" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/peoples/documents", + "method": "POST", + "body": { + "asset": "150k-people.json" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "typoTolerance": { + "disableOnAttributes": ["featured_job_organization_name"] + } + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "typoTolerance": { + "disableOnAttributes": [] + } + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "typoTolerance": { + "disableOnWords": ["Ben","Elowitz","Kevin","Flaherty", "Ron", "Dustin", "Owen", "Chris", "Mark", "Matt", "Peter", "Van", "Head", "of"] + } + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/peoples/settings", + "method": "PATCH", + "body": { + "inline": { + "typoTolerance": { + "disableOnWords": [] + } + } + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file From e8516f00c4a00214ec0d3d808074b55f407f5106 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Mar 2024 10:39:21 +0100 Subject: [PATCH 32/32] move settings workload in root workload directory --- workloads/{settings => }/settings-add-remove-filters.json | 0 workloads/{settings => }/settings-proximity-precision.json | 0 workloads/{settings => }/settings-remove-add-swap-searchable.json | 0 workloads/{settings => }/settings-typo.json | 0 4 files changed, 0 insertions(+), 0 
deletions(-) rename workloads/{settings => }/settings-add-remove-filters.json (100%) rename workloads/{settings => }/settings-proximity-precision.json (100%) rename workloads/{settings => }/settings-remove-add-swap-searchable.json (100%) rename workloads/{settings => }/settings-typo.json (100%) diff --git a/workloads/settings/settings-add-remove-filters.json b/workloads/settings-add-remove-filters.json similarity index 100% rename from workloads/settings/settings-add-remove-filters.json rename to workloads/settings-add-remove-filters.json diff --git a/workloads/settings/settings-proximity-precision.json b/workloads/settings-proximity-precision.json similarity index 100% rename from workloads/settings/settings-proximity-precision.json rename to workloads/settings-proximity-precision.json diff --git a/workloads/settings/settings-remove-add-swap-searchable.json b/workloads/settings-remove-add-swap-searchable.json similarity index 100% rename from workloads/settings/settings-remove-add-swap-searchable.json rename to workloads/settings-remove-add-swap-searchable.json diff --git a/workloads/settings/settings-typo.json b/workloads/settings-typo.json similarity index 100% rename from workloads/settings/settings-typo.json rename to workloads/settings-typo.json
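Taken together, PATCH 31/32 and 32/32 leave four settings workloads at the root of `workloads/`, each declaring a dataset asset (with its SHA-256 checksum), a run count, extra CLI arguments, and a list of HTTP commands that either fire and forget (`DontWait`) or block until the task finishes (`WaitForTask`). For orientation, here is a hypothetical `serde` mirror of that schema; the authoritative definitions belong to the repository's `xtask` bench runner and may differ in naming and detail:

```rust
// Hypothetical mirror of the workload JSON schema shown above — the
// real types live in the `xtask` crate and may not match exactly.
use std::collections::BTreeMap;

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Workload {
    name: String,
    run_count: u32,
    extra_cli_args: Vec<String>,
    assets: BTreeMap<String, Asset>,
    commands: Vec<Command>,
}

#[derive(Debug, Deserialize)]
struct Asset {
    local_location: Option<String>,
    remote_location: String,
    sha256: String,
}

#[derive(Debug, Deserialize)]
struct Command {
    route: String,
    method: String,
    body: serde_json::Value, // either { "inline": ... } or { "asset": ... }
    synchronous: SyncMode,
}

#[derive(Debug, Deserialize)]
enum SyncMode {
    DontWait,
    WaitForTask,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Post-move path from PATCH 32/32; requires `serde` (with the
    // derive feature) and `serde_json` in Cargo.toml.
    let raw = std::fs::read_to_string("workloads/settings-typo.json")?;
    let workload: Workload = serde_json::from_str(&raw)?;
    println!("{}: {} commands", workload.name, workload.commands.len());
    Ok(())
}
```

Once parsed, such a workload is driven end to end by the bench runner, invoked the same way as in the CI workflow at the top of this series (`cargo xtask bench ...`).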