From 1956045a0619c5f80551c6f662823158f0b6974b Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 27 Nov 2023 14:45:18 +0100 Subject: [PATCH 01/12] add the option --- meilisearch/src/analytics/segment_analytics.rs | 1 + meilisearch/src/option.rs | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 1ad277c28..6a617577b 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -290,6 +290,7 @@ impl From for Infos { http_addr, master_key: _, env, + task_webhook_url: _, max_index_size: _, max_task_db_size: _, http_payload_size_limit, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 1ed20f5b5..37472e174 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -28,6 +28,7 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH"; const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR"; const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; +const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; #[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; @@ -156,6 +157,10 @@ pub struct Opt { #[serde(default = "default_env")] pub env: String, + /// Called whenever a task finishes so a third party can be notified. + #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] + pub task_webhook_url: Option, + /// Deactivates Meilisearch's built-in telemetry when provided. /// /// Meilisearch automatically collects data from all instances that do not opt out using this flag. @@ -375,6 +380,7 @@ impl Opt { http_addr, master_key, env, + task_webhook_url, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -409,6 +415,10 @@ impl Opt { export_to_env_if_not_present(MEILI_MASTER_KEY, master_key); } export_to_env_if_not_present(MEILI_ENV, env); + if let Some(task_webhook_url) = task_webhook_url { + export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url); + } + #[cfg(feature = "analytics")] { export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); From d78ad51082c4cfc3598a0105655f25154c9e9b60 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 27 Nov 2023 15:11:22 +0100 Subject: [PATCH 02/12] Implement the webhook --- Cargo.lock | 1 + index-scheduler/Cargo.toml | 1 + index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 35 ++++++++++++++++++++++++--- meilisearch/src/lib.rs | 1 + 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d57d381ed..ca7eb715f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2792,6 +2792,7 @@ dependencies = [ "tempfile", "thiserror", "time", + "ureq", "uuid 1.5.0", ] diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index c4a37b7d6..4d6e4ffd0 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -30,6 +30,7 @@ synchronoise = "1.0.1" tempfile = "3.5.0" thiserror = "1.0.40" time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] } +ureq = "2.9.1" uuid = { version = "1.3.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index ddb9e934a..9261bf66d 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -37,6 +37,7 @@ pub fn snapshot_index_scheduler(scheduler: 
&IndexScheduler) -> String { snapshots_path: _, auth_path: _, version_file_path: _, + webhook_url: _, test_breakpoint_sdr: _, planned_failures: _, run_loop_iteration: _, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b9b360fa4..6756990af 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -170,8 +170,8 @@ impl ProcessingTasks { } /// Set the processing tasks to an empty list - fn stop_processing(&mut self) { - self.processing = RoaringBitmap::new(); + fn stop_processing(&mut self) -> RoaringBitmap { + std::mem::take(&mut self.processing) } /// Returns `true` if there, at least, is one task that is currently processing that we must stop. @@ -241,6 +241,7 @@ pub struct IndexSchedulerOptions { pub snapshots_path: PathBuf, /// The path to the folder containing the dumps. pub dumps_path: PathBuf, + pub webhook_url: Option, /// The maximum size, in bytes, of the task index. pub task_db_size: usize, /// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index. @@ -323,6 +324,9 @@ pub struct IndexScheduler { /// The maximum number of tasks that will be batched together. pub(crate) max_number_of_batched_tasks: usize, + /// The webhook url we should send tasks to after processing every batches. + pub(crate) webhook_url: Option, + /// A frame to output the indexation profiling files to disk. pub(crate) puffin_frame: Arc, @@ -388,6 +392,7 @@ impl IndexScheduler { dumps_path: self.dumps_path.clone(), auth_path: self.auth_path.clone(), version_file_path: self.version_file_path.clone(), + webhook_url: self.webhook_url.clone(), currently_updating_index: self.currently_updating_index.clone(), embedders: self.embedders.clone(), #[cfg(test)] @@ -487,6 +492,7 @@ impl IndexScheduler { snapshots_path: options.snapshots_path, auth_path: options.auth_path, version_file_path: options.version_file_path, + webhook_url: options.webhook_url, currently_updating_index: Arc::new(RwLock::new(None)), embedders: Default::default(), @@ -1251,19 +1257,41 @@ impl IndexScheduler { } } - self.processing_tasks.write().unwrap().stop_processing(); + let processed = self.processing_tasks.write().unwrap().stop_processing(); #[cfg(test)] self.maybe_fail(tests::FailureLocation::CommittingWtxn)?; wtxn.commit().map_err(Error::HeedTransaction)?; + // We shouldn't crash the tick function if we can't send data to the webhook. + let _ = self.notify_webhook(&processed); + #[cfg(test)] self.breakpoint(Breakpoint::AfterProcessing); Ok(TickOutcome::TickAgain(processed_tasks)) } + /// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one. 
+ fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> { + if let Some(ref url) = self.webhook_url { + let rtxn = self.env.read_txn()?; + + // on average a task takes ~50 bytes + let mut buffer = Vec::with_capacity(updated.len() as usize * 50); + + for id in updated { + let task = self.get_task(&rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; + let _ = serde_json::to_writer(&mut buffer, &task); + } + + let _ = ureq::post(url).send_bytes(&buffer); + } + + Ok(()) + } + /// Register a task to cleanup the task queue if needed fn cleanup_task_queue(&self) -> Result<()> { let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; @@ -1677,6 +1705,7 @@ mod tests { indexes_path: tempdir.path().join("indexes"), snapshots_path: tempdir.path().join("snapshots"), dumps_path: tempdir.path().join("dumps"), + webhook_url: None, task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. enable_mdb_writemap: false, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index e0f488eea..14a1c5b45 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -228,6 +228,7 @@ fn open_or_create_database_unchecked( indexes_path: opt.db_path.join("indexes"), snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), + webhook_url: opt.task_webhook_url.clone(), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, From 391eb72137e451ff4c6252c752b868695e4d74aa Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 11:39:42 +0100 Subject: [PATCH 03/12] start writing a test with actix but it doesn't works --- meilisearch/tests/tasks/mod.rs | 1 + meilisearch/tests/tasks/webhook.rs | 101 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 meilisearch/tests/tasks/webhook.rs diff --git a/meilisearch/tests/tasks/mod.rs b/meilisearch/tests/tasks/mod.rs index 7a5fa6388..ed387224e 100644 --- a/meilisearch/tests/tasks/mod.rs +++ b/meilisearch/tests/tasks/mod.rs @@ -1,4 +1,5 @@ mod errors; +mod webhook; use meili_snap::insta::assert_json_snapshot; use time::format_description::well_known::Rfc3339; diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs new file mode 100644 index 000000000..34edb53eb --- /dev/null +++ b/meilisearch/tests/tasks/webhook.rs @@ -0,0 +1,101 @@ +use std::sync::Arc; + +use actix_http::body::MessageBody; +use actix_web::dev::{ServiceFactory, ServiceResponse}; +use actix_web::web::{Bytes, Data}; +use actix_web::{post, App, HttpResponse, HttpServer}; +use meili_snap::snapshot; +use meilisearch::Opt; +use tokio::sync::mpsc; + +use crate::common::{default_settings, Server}; +use crate::json; + +#[post("/")] +async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { + println!("Received something"); + let body = body.to_vec(); + sender.send(body).await.unwrap(); + HttpResponse::Ok().into() +} + +fn create_app( + sender: Arc>>, +) -> actix_web::App< + impl ServiceFactory< + actix_web::dev::ServiceRequest, + Config = (), + Response = ServiceResponse, + Error = actix_web::Error, + InitError = (), + >, +> { + App::new().service(forward_body).app_data(Data::from(sender)) +} + +struct WebhookHandle { + pub server_handle: tokio::task::JoinHandle>, + pub url: String, + pub receiver: mpsc::UnboundedReceiver>, +} + +async fn create_webhook_server() -> WebhookHandle { + let (sender, receiver) 
= mpsc::unbounded_channel(); + let sender = Arc::new(sender); + + let server = + HttpServer::new(move || create_app(sender.clone())).bind(("localhost", 0)).unwrap(); + let (ip, scheme) = server.addrs_with_scheme()[0]; + let url = format!("{scheme}://{ip}/"); + + // TODO: Is it cleaned once the test is over + let server_handle = tokio::spawn(server.run()); + + WebhookHandle { server_handle, url, receiver } +} + +#[actix_web::test] +async fn test_basic_webhook() { + // Request a new server from the pool + let mut handle = create_webhook_server().await; + + let db_path = tempfile::tempdir().unwrap(); + // let (_handle, mut webhook) = create_webhook_server().await; + let server = Server::new_with_options(Opt { + task_webhook_url: Some(handle.url.clone()), + ..default_settings(db_path.path()) + }) + .await + .unwrap(); + + println!("Sending something"); + reqwest::Client::new().post(&handle.url).body("hello").send().await.unwrap(); + + // let (_, status) = server.create_index(json!({ "uid": "tamo" })).await; + // snapshot!(status, @"202 Accepted"); + + let payload = handle.receiver.recv().await.unwrap(); + let jsonl = String::from_utf8(payload).unwrap(); + + // TODO: kill the server + // handle.server_handle.; + + snapshot!(jsonl, + @r###" + { + "uid": 0, + "indexUid": null, + "status": "succeeded", + "type": "dumpCreation", + "canceledBy": null, + "details": { + "dumpUid": "[dumpUid]" + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} From fbea72137829e02ad0cc5e1df831ac0e449d4e9b Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 14:47:07 +0100 Subject: [PATCH 04/12] add a first working test with actixweb --- index-scheduler/src/lib.rs | 3 +- meilisearch/tests/tasks/webhook.rs | 46 ++++++++++++------------------ 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 6756990af..8502da242 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1286,7 +1286,8 @@ impl IndexScheduler { let _ = serde_json::to_writer(&mut buffer, &task); } - let _ = ureq::post(url).send_bytes(&buffer); + println!("Sending request to {url}"); + let _ = ureq::post(url).send_bytes(&buffer).unwrap(); } Ok(()) diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index 34edb53eb..c95d3fc5b 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -12,10 +12,10 @@ use crate::common::{default_settings, Server}; use crate::json; #[post("/")] -async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { +async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { println!("Received something"); let body = body.to_vec(); - sender.send(body).await.unwrap(); + sender.send(body).unwrap(); HttpResponse::Ok().into() } @@ -40,13 +40,18 @@ struct WebhookHandle { } async fn create_webhook_server() -> WebhookHandle { + let mut log_builder = env_logger::Builder::new(); + log_builder.parse_filters("debug"); + log_builder.init(); + let (sender, receiver) = mpsc::unbounded_channel(); let sender = Arc::new(sender); let server = - HttpServer::new(move || create_app(sender.clone())).bind(("localhost", 0)).unwrap(); + HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap(); let (ip, scheme) = server.addrs_with_scheme()[0]; let url = format!("{scheme}://{ip}/"); + println!("url is {url}"); // TODO: Is it cleaned once the test is over let 
server_handle = tokio::spawn(server.run()); @@ -60,7 +65,6 @@ async fn test_basic_webhook() { let mut handle = create_webhook_server().await; let db_path = tempfile::tempdir().unwrap(); - // let (_handle, mut webhook) = create_webhook_server().await; let server = Server::new_with_options(Opt { task_webhook_url: Some(handle.url.clone()), ..default_settings(db_path.path()) @@ -68,34 +72,20 @@ async fn test_basic_webhook() { .await .unwrap(); - println!("Sending something"); - reqwest::Client::new().post(&handle.url).body("hello").send().await.unwrap(); - - // let (_, status) = server.create_index(json!({ "uid": "tamo" })).await; - // snapshot!(status, @"202 Accepted"); + let index = server.index("tamo"); + for i in 0..3 { + let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; + } let payload = handle.receiver.recv().await.unwrap(); let jsonl = String::from_utf8(payload).unwrap(); - // TODO: kill the server - // handle.server_handle.; + snapshot!(jsonl, + @r###"{"uid":0,"enqueuedAt":"2023-11-28T13:43:24.754587Z","startedAt":"2023-11-28T13:43:24.756445Z","finishedAt":"2023-11-28T13:43:24.791527Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"ca77ac82-4504-4c85-81a5-1a8d68f1a386","documents_count":1,"allow_index_creation":true}}}"###); + + let payload = handle.receiver.recv().await.unwrap(); + let jsonl = String::from_utf8(payload).unwrap(); snapshot!(jsonl, - @r###" - { - "uid": 0, - "indexUid": null, - "status": "succeeded", - "type": "dumpCreation", - "canceledBy": null, - "details": { - "dumpUid": "[dumpUid]" - }, - "error": null, - "duration": "[duration]", - "enqueuedAt": "[date]", - "startedAt": "[date]", - "finishedAt": "[date]" - } - "###); + @r###"{"uid":1,"enqueuedAt":"2023-11-28T13:43:24.761498Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"c947aefa-7f98-433d-8ce4-5926d8d2ce10","documents_count":1,"allow_index_creation":true}}}{"uid":2,"enqueuedAt":"2023-11-28T13:43:24.76776Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"a21d6da6-9322-4827-8c08-f33d2e1b6cae","documents_count":1,"allow_index_creation":true}}}"###); } From 3adbc2b942e7bb0808720f46838aab2acef26e00 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 15:08:13 +0100 Subject: [PATCH 05/12] return a task view instead of a task --- index-scheduler/src/lib.rs | 4 +- meilisearch-types/src/lib.rs | 1 + meilisearch-types/src/task_view.rs | 139 ++++++++++++++++++++++++++++ meilisearch/src/routes/tasks.rs | 140 +---------------------------- meilisearch/tests/tasks/webhook.rs | 10 ++- 5 files changed, 153 insertions(+), 141 deletions(-) create mode 100644 meilisearch-types/src/task_view.rs diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8502da242..d96b6c2af 100644 --- 
a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -54,6 +54,7 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; +use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use puffin::FrameView; use roaring::RoaringBitmap; @@ -1283,7 +1284,8 @@ impl IndexScheduler { for id in updated { let task = self.get_task(&rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - let _ = serde_json::to_writer(&mut buffer, &task); + let _ = serde_json::to_writer(&mut buffer, &TaskView::from_task(&task)); + buffer.push(b'\n'); } println!("Sending request to {url}"); diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs index b0762563a..e4f5cbeb4 100644 --- a/meilisearch-types/src/lib.rs +++ b/meilisearch-types/src/lib.rs @@ -9,6 +9,7 @@ pub mod index_uid_pattern; pub mod keys; pub mod settings; pub mod star_or; +pub mod task_view; pub mod tasks; pub mod versioning; pub use milli::{heed, Index}; diff --git a/meilisearch-types/src/task_view.rs b/meilisearch-types/src/task_view.rs new file mode 100644 index 000000000..02be91a88 --- /dev/null +++ b/meilisearch-types/src/task_view.rs @@ -0,0 +1,139 @@ +use serde::Serialize; +use time::{Duration, OffsetDateTime}; + +use crate::error::ResponseError; +use crate::settings::{Settings, Unchecked}; +use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct TaskView { + pub uid: TaskId, + #[serde(default)] + pub index_uid: Option, + pub status: Status, + #[serde(rename = "type")] + pub kind: Kind, + pub canceled_by: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub details: Option, + pub error: Option, + #[serde(serialize_with = "serialize_duration", default)] + pub duration: Option, + #[serde(with = "time::serde::rfc3339")] + pub enqueued_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339::option", default)] + pub started_at: Option, + #[serde(with = "time::serde::rfc3339::option", default)] + pub finished_at: Option, +} + +impl TaskView { + pub fn from_task(task: &Task) -> TaskView { + TaskView { + uid: task.uid, + index_uid: task.index_uid().map(ToOwned::to_owned), + status: task.status, + kind: task.kind.as_kind(), + canceled_by: task.canceled_by, + details: task.details.clone().map(DetailsView::from), + error: task.error.clone(), + duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start), + enqueued_at: task.enqueued_at, + started_at: task.started_at, + finished_at: task.finished_at, + } + } +} + +#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DetailsView { + #[serde(skip_serializing_if = "Option::is_none")] + pub received_documents: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub indexed_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub primary_key: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub provided_ids: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub matched_tasks: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub 
canceled_tasks: Option<Option<u64>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_tasks: Option<Option<u64>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub original_filter: Option<Option<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub dump_uid: Option<Option<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(flatten)] + pub settings: Option<Box<Settings<Unchecked>>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub swaps: Option<Vec<IndexSwap>>, +} + +impl From<Details>
for DetailsView { + fn from(details: Details) -> Self { + match details { + Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => { + DetailsView { + received_documents: Some(received_documents), + indexed_documents: Some(indexed_documents), + ..DetailsView::default() + } + } + Details::SettingsUpdate { settings } => { + DetailsView { settings: Some(settings), ..DetailsView::default() } + } + Details::IndexInfo { primary_key } => { + DetailsView { primary_key: Some(primary_key), ..DetailsView::default() } + } + Details::DocumentDeletion { + provided_ids: received_document_ids, + deleted_documents, + } => DetailsView { + provided_ids: Some(received_document_ids), + deleted_documents: Some(deleted_documents), + original_filter: Some(None), + ..DetailsView::default() + }, + Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { + DetailsView { + provided_ids: Some(0), + original_filter: Some(Some(original_filter)), + deleted_documents: Some(deleted_documents), + ..DetailsView::default() + } + } + Details::ClearAll { deleted_documents } => { + DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() } + } + Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => { + DetailsView { + matched_tasks: Some(matched_tasks), + canceled_tasks: Some(canceled_tasks), + original_filter: Some(Some(original_filter)), + ..DetailsView::default() + } + } + Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => { + DetailsView { + matched_tasks: Some(matched_tasks), + deleted_tasks: Some(deleted_tasks), + original_filter: Some(Some(original_filter)), + ..DetailsView::default() + } + } + Details::Dump { dump_uid } => { + DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() } + } + Details::IndexSwap { swaps } => { + DetailsView { swaps: Some(swaps), ..Default::default() } + } + } + } +} diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index f7d4c44d7..03b63001d 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -8,11 +8,9 @@ use meilisearch_types::deserr::DeserrQueryParamError; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{InvalidTaskDateError, ResponseError}; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList}; -use meilisearch_types::tasks::{ - serialize_duration, Details, IndexSwap, Kind, KindWithContent, Status, Task, -}; +use meilisearch_types::task_view::TaskView; +use meilisearch_types::tasks::{Kind, KindWithContent, Status}; use serde::Serialize; use serde_json::json; use time::format_description::well_known::Rfc3339; @@ -37,140 +35,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks)))) .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))); } - -#[derive(Debug, Clone, PartialEq, Eq, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct TaskView { - pub uid: TaskId, - #[serde(default)] - pub index_uid: Option, - pub status: Status, - #[serde(rename = "type")] - pub kind: Kind, - pub canceled_by: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option, - pub error: Option, - #[serde(serialize_with = "serialize_duration", default)] - pub duration: Option, - #[serde(with = "time::serde::rfc3339")] - pub enqueued_at: OffsetDateTime, - 
#[serde(with = "time::serde::rfc3339::option", default)] - pub started_at: Option, - #[serde(with = "time::serde::rfc3339::option", default)] - pub finished_at: Option, -} - -impl TaskView { - pub fn from_task(task: &Task) -> TaskView { - TaskView { - uid: task.uid, - index_uid: task.index_uid().map(ToOwned::to_owned), - status: task.status, - kind: task.kind.as_kind(), - canceled_by: task.canceled_by, - details: task.details.clone().map(DetailsView::from), - error: task.error.clone(), - duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start), - enqueued_at: task.enqueued_at, - started_at: task.started_at, - finished_at: task.finished_at, - } - } -} - -#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct DetailsView { - #[serde(skip_serializing_if = "Option::is_none")] - pub received_documents: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub indexed_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub primary_key: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub provided_ids: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub deleted_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub matched_tasks: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub canceled_tasks: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub deleted_tasks: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub original_filter: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub dump_uid: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(flatten)] - pub settings: Option>>, - #[serde(skip_serializing_if = "Option::is_none")] - pub swaps: Option>, -} - -impl From
for DetailsView { - fn from(details: Details) -> Self { - match details { - Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => { - DetailsView { - received_documents: Some(received_documents), - indexed_documents: Some(indexed_documents), - ..DetailsView::default() - } - } - Details::SettingsUpdate { settings } => { - DetailsView { settings: Some(settings), ..DetailsView::default() } - } - Details::IndexInfo { primary_key } => { - DetailsView { primary_key: Some(primary_key), ..DetailsView::default() } - } - Details::DocumentDeletion { - provided_ids: received_document_ids, - deleted_documents, - } => DetailsView { - provided_ids: Some(received_document_ids), - deleted_documents: Some(deleted_documents), - original_filter: Some(None), - ..DetailsView::default() - }, - Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { - DetailsView { - provided_ids: Some(0), - original_filter: Some(Some(original_filter)), - deleted_documents: Some(deleted_documents), - ..DetailsView::default() - } - } - Details::ClearAll { deleted_documents } => { - DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() } - } - Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => { - DetailsView { - matched_tasks: Some(matched_tasks), - canceled_tasks: Some(canceled_tasks), - original_filter: Some(Some(original_filter)), - ..DetailsView::default() - } - } - Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => { - DetailsView { - matched_tasks: Some(matched_tasks), - deleted_tasks: Some(deleted_tasks), - original_filter: Some(Some(original_filter)), - ..DetailsView::default() - } - } - Details::Dump { dump_uid } => { - DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() } - } - Details::IndexSwap { swaps } => { - DetailsView { swaps: Some(swaps), ..Default::default() } - } - } - } -} - #[derive(Debug, Deserr)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] pub struct TasksFilterQuery { diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index c95d3fc5b..613b4ff3f 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -73,6 +73,7 @@ async fn test_basic_webhook() { .unwrap(); let index = server.index("tamo"); + // TODO: may be flaky, we're relying on the fact that during the time the first document addition succeed, the two other operations will be received. 
for i in 0..3 { let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; } @@ -81,11 +82,16 @@ async fn test_basic_webhook() { let jsonl = String::from_utf8(payload).unwrap(); snapshot!(jsonl, - @r###"{"uid":0,"enqueuedAt":"2023-11-28T13:43:24.754587Z","startedAt":"2023-11-28T13:43:24.756445Z","finishedAt":"2023-11-28T13:43:24.791527Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"ca77ac82-4504-4c85-81a5-1a8d68f1a386","documents_count":1,"allow_index_creation":true}}}"###); + @r###" + {"uid":0,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.027444S","enqueuedAt":"2023-11-28T14:05:37.767678Z","startedAt":"2023-11-28T14:05:37.769519Z","finishedAt":"2023-11-28T14:05:37.796963Z"} + "###); let payload = handle.receiver.recv().await.unwrap(); let jsonl = String::from_utf8(payload).unwrap(); snapshot!(jsonl, - @r###"{"uid":1,"enqueuedAt":"2023-11-28T13:43:24.761498Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"c947aefa-7f98-433d-8ce4-5926d8d2ce10","documents_count":1,"allow_index_creation":true}}}{"uid":2,"enqueuedAt":"2023-11-28T13:43:24.76776Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"a21d6da6-9322-4827-8c08-f33d2e1b6cae","documents_count":1,"allow_index_creation":true}}}"###); + @r###" + {"uid":1,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.773731Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} + {"uid":2,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.780466Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} + "###); } From 0b2fff27f231c3eb37c9340a6879135c70a227d2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 15:55:48 +0100 Subject: [PATCH 06/12] update and fix the test --- index-scheduler/src/lib.rs | 1 - meilisearch/tests/tasks/webhook.rs | 73 ++++++++++++++++++++---------- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index d96b6c2af..5d44ed104 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1288,7 +1288,6 @@ impl IndexScheduler { buffer.push(b'\n'); } - println!("Sending request to {url}"); let _ = ureq::post(url).send_bytes(&buffer).unwrap(); } diff --git a/meilisearch/tests/tasks/webhook.rs 
b/meilisearch/tests/tasks/webhook.rs index 613b4ff3f..e852839ec 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -1,10 +1,14 @@ +//! To test the webhook, we need to spawn a new server with a URL listening for +//! post requests. The webhook handle starts a server and forwards all the +//! received requests into a channel for you to handle. + use std::sync::Arc; use actix_http::body::MessageBody; use actix_web::dev::{ServiceFactory, ServiceResponse}; use actix_web::web::{Bytes, Data}; use actix_web::{post, App, HttpResponse, HttpServer}; -use meili_snap::snapshot; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tokio::sync::mpsc; @@ -13,7 +17,6 @@ use crate::json; #[post("/")] async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { - println!("Received something"); let body = body.to_vec(); sender.send(body).unwrap(); HttpResponse::Ok().into() @@ -47,51 +50,73 @@ async fn create_webhook_server() -> WebhookHandle { let (sender, receiver) = mpsc::unbounded_channel(); let sender = Arc::new(sender); + // By listening on the port 0, the system will give us any available port. let server = HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap(); let (ip, scheme) = server.addrs_with_scheme()[0]; let url = format!("{scheme}://{ip}/"); - println!("url is {url}"); - // TODO: Is it cleaned once the test is over let server_handle = tokio::spawn(server.run()); - WebhookHandle { server_handle, url, receiver } } #[actix_web::test] async fn test_basic_webhook() { - // Request a new server from the pool - let mut handle = create_webhook_server().await; + let WebhookHandle { server_handle, url, mut receiver } = create_webhook_server().await; let db_path = tempfile::tempdir().unwrap(); let server = Server::new_with_options(Opt { - task_webhook_url: Some(handle.url.clone()), + task_webhook_url: Some(url), ..default_settings(db_path.path()) }) .await .unwrap(); let index = server.index("tamo"); - // TODO: may be flaky, we're relying on the fact that during the time the first document addition succeed, the two other operations will be received. - for i in 0..3 { + // May be flaky: we're relying on the fact that while the first document addition is processed, the other + // operations will be received and will be batched together. If it doesn't happen it's not a problem + // the rest of the test won't assume anything about the number of tasks per batch. 
+ for i in 0..5 { let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; } - let payload = handle.receiver.recv().await.unwrap(); - let jsonl = String::from_utf8(payload).unwrap(); + let mut nb_tasks = 0; + while let Some(payload) = receiver.recv().await { + let payload = String::from_utf8(payload).unwrap(); + let jsonl = payload.split('\n'); + for json in jsonl { + if json.is_empty() { + break; // we reached EOF + } + nb_tasks += 1; + let json: serde_json::Value = serde_json::from_str(json).unwrap(); + snapshot!( + json_string!(json, { ".uid" => "[uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": "[uid]", + "indexUid": "tamo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + } + if nb_tasks == 5 { + break; + } + } - snapshot!(jsonl, - @r###" - {"uid":0,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.027444S","enqueuedAt":"2023-11-28T14:05:37.767678Z","startedAt":"2023-11-28T14:05:37.769519Z","finishedAt":"2023-11-28T14:05:37.796963Z"} - "###); + assert!(nb_tasks == 5, "We should have received the 5 tasks but only received {nb_tasks}"); - let payload = handle.receiver.recv().await.unwrap(); - let jsonl = String::from_utf8(payload).unwrap(); - - snapshot!(jsonl, - @r###" - {"uid":1,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.773731Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} - {"uid":2,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.780466Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} - "###); + server_handle.abort(); } From 547379abb018af63e21a550384d19cf2c1fbb033 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 16:28:11 +0100 Subject: [PATCH 07/12] parse the url correctly --- Cargo.lock | 18 ++++++++++-------- meilisearch/Cargo.toml | 1 + meilisearch/src/lib.rs | 2 +- meilisearch/src/option.rs | 9 +++++++-- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ca7eb715f..ab566e836 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1704,9 +1704,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -2756,9 +2756,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -3563,6 +3563,7 @@ dependencies = [ "tokio", "tokio-stream", "toml", + "url", "urlencoding", "uuid 1.5.0", "vergen", @@ -4071,9 +4072,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" @@ -5603,13 +5604,14 @@ dependencies = [ [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index c59b38fa6..434a488f5 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -104,6 +104,7 @@ walkdir = "2.3.3" yaup = "0.2.1" serde_urlencoded = "0.7.1" termcolor = "1.2.0" +url = { version = "2.5.0", features = ["serde"] } [dev-dependencies] actix-rt = "2.8.0" diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 14a1c5b45..3698e5da4 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -228,7 +228,7 @@ fn open_or_create_database_unchecked( indexes_path: opt.db_path.join("indexes"), snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), - webhook_url: opt.task_webhook_url.clone(), + webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 37472e174..e1f16d888 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -21,6 +21,7 @@ use rustls::RootCertStore; use rustls_pemfile::{certs, pkcs8_private_keys, rsa_private_keys}; use serde::{Deserialize, Serialize}; use sysinfo::{RefreshKind, System, SystemExt}; +use url::Url; const POSSIBLE_ENV: [&str; 2] = ["development", "production"]; @@ -69,6 +70,10 @@ const MEILI_MAX_INDEXING_MEMORY: &str = "MEILI_MAX_INDEXING_MEMORY"; const MEILI_MAX_INDEXING_THREADS: &str = "MEILI_MAX_INDEXING_THREADS"; const DEFAULT_LOG_EVERY_N: usize = 100_000; +fn parse_url(s: &str) -> Result { + Url::parse(s) +} + // Each environment (index and task-db) is taking space in the virtual address space. // Ideally, indexes can occupy 2TiB each to avoid having to manually resize them. // The actual size of the virtual address space is computed at startup to determine how many 2TiB indexes can be @@ -159,7 +164,7 @@ pub struct Opt { /// Called whenever a task finishes so a third party can be notified. #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] - pub task_webhook_url: Option, + pub task_webhook_url: Option, /// Deactivates Meilisearch's built-in telemetry when provided. 
/// @@ -416,7 +421,7 @@ impl Opt { } export_to_env_if_not_present(MEILI_ENV, env); if let Some(task_webhook_url) = task_webhook_url { - export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url); + export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string()); } #[cfg(feature = "analytics")] From be72326c0a656841e3e5d051284f8997abe2b065 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Nov 2023 13:09:04 +0100 Subject: [PATCH 08/12] gzip the tasks --- Cargo.lock | 5 +++-- index-scheduler/Cargo.toml | 1 + index-scheduler/src/lib.rs | 7 ++++++- meilisearch/tests/tasks/webhook.rs | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab566e836..e632d8621 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1680,9 +1680,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.26" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", @@ -2777,6 +2777,7 @@ dependencies = [ "dump", "enum-iterator", "file-store", + "flate2", "insta", "log", "meili-snap", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 4d6e4ffd0..a8c19f435 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -18,6 +18,7 @@ derive_builder = "0.12.0" dump = { path = "../dump" } enum-iterator = "1.4.0" file-store = { path = "../file-store" } +flate2 = "1.0.28" log = "0.4.17" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5d44ed104..bfaca3126 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -45,6 +45,8 @@ use dump::{KindDump, TaskDump, UpdateFile}; pub use error::Error; pub use features::RoFeatures; use file_store::FileStore; +use flate2::bufread::GzEncoder; +use flate2::Compression; use meilisearch_types::error::ResponseError; use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; @@ -1288,7 +1290,10 @@ impl IndexScheduler { buffer.push(b'\n'); } - let _ = ureq::post(url).send_bytes(&buffer).unwrap(); + let reader = GzEncoder::new(&buffer[..], Compression::default()); + if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) { + log::error!("While sending data to the webhook: {e}"); + } } Ok(()) diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index e852839ec..688d35e8b 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -11,6 +11,7 @@ use actix_web::{post, App, HttpResponse, HttpServer}; use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tokio::sync::mpsc; +use url::Url; use crate::common::{default_settings, Server}; use crate::json; @@ -66,7 +67,7 @@ async fn test_basic_webhook() { let db_path = tempfile::tempdir().unwrap(); let server = Server::new_with_options(Opt { - task_webhook_url: Some(url), + task_webhook_url: Some(Url::parse(&url).unwrap()), ..default_settings(db_path.path()) }) .await From c83a33017e5224b4f7dfa713201b80b1070b69bc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Nov 2023 14:27:50 +0100 Subject: [PATCH 09/12] stream and chunk the data --- index-scheduler/src/lib.rs | 63 +++++++++++++++++++++++++----- 
meilisearch/tests/tasks/webhook.rs | 2 +- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index bfaca3126..b5b061a50 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -34,6 +34,7 @@ pub type TaskId = u32; use std::collections::{BTreeMap, HashMap}; use std::fs::File; +use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicBool; @@ -1279,18 +1280,60 @@ impl IndexScheduler { /// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one. fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> { if let Some(ref url) = self.webhook_url { - let rtxn = self.env.read_txn()?; - - // on average a task takes ~50 bytes - let mut buffer = Vec::with_capacity(updated.len() as usize * 50); - - for id in updated { - let task = self.get_task(&rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - let _ = serde_json::to_writer(&mut buffer, &TaskView::from_task(&task)); - buffer.push(b'\n'); + struct TaskReader<'a, 'b> { + rtxn: &'a RoTxn<'a>, + index_scheduler: &'a IndexScheduler, + tasks: &'b mut roaring::bitmap::Iter<'b>, + buffer: Vec, + written: usize, } - let reader = GzEncoder::new(&buffer[..], Compression::default()); + impl<'a, 'b> Read for TaskReader<'a, 'b> { + fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result { + if self.buffer.is_empty() { + match self.tasks.next() { + None => return Ok(0), + Some(task_id) => { + let task = self + .index_scheduler + .get_task(self.rtxn, task_id) + .map_err(io::Error::other)? + .ok_or_else(|| io::Error::other(Error::CorruptedTaskQueue))?; + + serde_json::to_writer( + &mut self.buffer, + &TaskView::from_task(&task), + )?; + self.buffer.push(b'\n'); + } + } + } + + let mut to_write = &self.buffer[self.written..]; + let wrote = io::copy(&mut to_write, &mut buf)?; + self.written += wrote as usize; + + // we wrote everything and must refresh our buffer on the next call + if self.written == self.buffer.len() { + self.written = 0; + self.buffer.clear(); + } + + Ok(wrote as usize) + } + } + + let rtxn = self.env.read_txn()?; + + let task_reader = TaskReader { + rtxn: &rtxn, + index_scheduler: self, + tasks: &mut updated.into_iter(), + buffer: Vec::with_capacity(50), // on average a task is around ~100 bytes + written: 0, + }; + + let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) { log::error!("While sending data to the webhook: {e}"); } diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index 688d35e8b..6979ff294 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -45,7 +45,7 @@ struct WebhookHandle { async fn create_webhook_server() -> WebhookHandle { let mut log_builder = env_logger::Builder::new(); - log_builder.parse_filters("debug"); + log_builder.parse_filters("info"); log_builder.init(); let (sender, receiver) = mpsc::unbounded_channel(); From 4fb25b878240ab0bc5034c6b25a6cc282884652a Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Nov 2023 14:51:47 +0100 Subject: [PATCH 10/12] fix clippy --- index-scheduler/src/lib.rs | 9 +++++++-- meilisearch/src/option.rs | 4 ---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b5b061a50..2ad263ca4 100644 --- 
a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1297,8 +1297,13 @@ impl IndexScheduler { let task = self .index_scheduler .get_task(self.rtxn, task_id) - .map_err(io::Error::other)? - .ok_or_else(|| io::Error::other(Error::CorruptedTaskQueue))?; + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + Error::CorruptedTaskQueue, + ) + })?; serde_json::to_writer( &mut self.buffer, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index e1f16d888..abb2bab6c 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -70,10 +70,6 @@ const MEILI_MAX_INDEXING_MEMORY: &str = "MEILI_MAX_INDEXING_MEMORY"; const MEILI_MAX_INDEXING_THREADS: &str = "MEILI_MAX_INDEXING_THREADS"; const DEFAULT_LOG_EVERY_N: usize = 100_000; -fn parse_url(s: &str) -> Result { - Url::parse(s) -} - // Each environment (index and task-db) is taking space in the virtual address space. // Ideally, indexes can occupy 2TiB each to avoid having to manually resize them. // The actual size of the virtual address space is computed at startup to determine how many 2TiB indexes can be From 19736cefe858377c4c6143daa05479361b87ad8e Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Dec 2023 10:38:01 +0100 Subject: [PATCH 11/12] add the analytics --- meilisearch/src/analytics/segment_analytics.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 6a617577b..0cdb18540 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -264,6 +264,7 @@ struct Infos { ignore_snapshot_if_db_exists: bool, http_addr: bool, http_payload_size_limit: Byte, + task_queue_webhook: bool, log_level: String, max_indexing_memory: MaxMemory, max_indexing_threads: MaxThreads, @@ -290,7 +291,7 @@ impl From for Infos { http_addr, master_key: _, env, - task_webhook_url: _, + task_webhook_url, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -344,6 +345,7 @@ impl From for Infos { http_addr: http_addr != default_http_addr(), http_payload_size_limit, experimental_max_number_of_batched_tasks, + task_queue_webhook: task_webhook_url.is_some(), log_level: log_level.to_string(), max_indexing_memory, max_indexing_threads, From fa2b96b9a5e80fc5ba18a5d78d30113ca0ee6d37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 19 Dec 2023 12:18:45 +0100 Subject: [PATCH 12/12] Add an Authorization Header along with the webhook calls --- index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 17 ++++++++++++++++- meilisearch/src/analytics/segment_analytics.rs | 3 +++ meilisearch/src/lib.rs | 1 + meilisearch/src/option.rs | 12 ++++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 9261bf66d..0adda43ff 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -38,6 +38,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { auth_path: _, version_file_path: _, webhook_url: _, + webhook_authorization_header: _, test_breakpoint_sdr: _, planned_failures: _, run_loop_iteration: _, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 2ad263ca4..296f8add1 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -245,7 +245,10 @@ pub struct 
IndexSchedulerOptions { pub snapshots_path: PathBuf, /// The path to the folder containing the dumps. pub dumps_path: PathBuf, + /// The URL on which we must send the tasks statuses pub webhook_url: Option, + /// The value we will send into the Authorization HTTP header on the webhook URL + pub webhook_authorization_header: Option, /// The maximum size, in bytes, of the task index. pub task_db_size: usize, /// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index. @@ -330,6 +333,8 @@ pub struct IndexScheduler { /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, + /// The Authorization header to send to the webhook URL. + pub(crate) webhook_authorization_header: Option, /// A frame to output the indexation profiling files to disk. pub(crate) puffin_frame: Arc, @@ -397,6 +402,7 @@ impl IndexScheduler { auth_path: self.auth_path.clone(), version_file_path: self.version_file_path.clone(), webhook_url: self.webhook_url.clone(), + webhook_authorization_header: self.webhook_authorization_header.clone(), currently_updating_index: self.currently_updating_index.clone(), embedders: self.embedders.clone(), #[cfg(test)] @@ -497,6 +503,7 @@ impl IndexScheduler { auth_path: options.auth_path, version_file_path: options.version_file_path, webhook_url: options.webhook_url, + webhook_authorization_header: options.webhook_authorization_header, currently_updating_index: Arc::new(RwLock::new(None)), embedders: Default::default(), @@ -1338,8 +1345,15 @@ impl IndexScheduler { written: 0, }; + // let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); - if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) { + let request = ureq::post(url).set("Content-Encoding", "gzip"); + let request = match &self.webhook_authorization_header { + Some(header) => request.set("Authorization", header), + None => request, + }; + + if let Err(e) = request.send(reader) { log::error!("While sending data to the webhook: {e}"); } } @@ -1761,6 +1775,7 @@ mod tests { snapshots_path: tempdir.path().join("snapshots"), dumps_path: tempdir.path().join("dumps"), webhook_url: None, + webhook_authorization_header: None, task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. 
enable_mdb_writemap: false, diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 0cdb18540..86a5eddb9 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -265,6 +265,7 @@ struct Infos { http_addr: bool, http_payload_size_limit: Byte, task_queue_webhook: bool, + task_webhook_authorization_header: bool, log_level: String, max_indexing_memory: MaxMemory, max_indexing_threads: MaxThreads, @@ -292,6 +293,7 @@ impl From for Infos { master_key: _, env, task_webhook_url, + task_webhook_authorization_header, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -346,6 +348,7 @@ impl From for Infos { http_payload_size_limit, experimental_max_number_of_batched_tasks, task_queue_webhook: task_webhook_url.is_some(), + task_webhook_authorization_header: task_webhook_authorization_header.is_some(), log_level: log_level.to_string(), max_indexing_memory, max_indexing_threads, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 3698e5da4..f1111962c 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -229,6 +229,7 @@ fn open_or_create_database_unchecked( snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), + webhook_authorization_header: opt.task_webhook_authorization_header.clone(), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index abb2bab6c..a0672c9cf 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -30,6 +30,7 @@ const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR"; const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; +const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER"; #[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; @@ -162,6 +163,10 @@ pub struct Opt { #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] pub task_webhook_url: Option, + /// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified. + #[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)] + pub task_webhook_authorization_header: Option, + /// Deactivates Meilisearch's built-in telemetry when provided. /// /// Meilisearch automatically collects data from all instances that do not opt out using this flag. @@ -382,6 +387,7 @@ impl Opt { master_key, env, task_webhook_url, + task_webhook_authorization_header, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -419,6 +425,12 @@ impl Opt { if let Some(task_webhook_url) = task_webhook_url { export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string()); } + if let Some(task_webhook_authorization_header) = task_webhook_authorization_header { + export_to_env_if_not_present( + MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER, + task_webhook_authorization_header, + ); + } #[cfg(feature = "analytics")] {
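Note on the webhook payload (illustrative sketch, not part of the patch series above): taken together, these patches make the index scheduler POST a gzip-compressed body (Content-Encoding: gzip) to the configured URL after each processed batch, containing one JSON-serialized TaskView per line (ndjson), with the Authorization header set when --task-webhook-authorization-header is provided. The sketch below shows how a third-party receiver might decode such a payload; the crate choices (flate2, serde_json) and the function name are assumptions made for illustration, not code from the patches.

use std::io::Read;

use flate2::read::GzDecoder;

/// Decompress a gzip-encoded webhook body and parse each non-empty line as
/// one JSON task object (the payload is ndjson: one task view per line).
fn decode_webhook_payload(gzipped_body: &[u8]) -> std::io::Result<Vec<serde_json::Value>> {
    // The scheduler compresses the stream with GzEncoder, so undo that first.
    let mut decoder = GzDecoder::new(gzipped_body);
    let mut ndjson = String::new();
    decoder.read_to_string(&mut ndjson)?;

    // Each non-empty line is one serialized task view.
    ndjson
        .lines()
        .filter(|line| !line.is_empty())
        .map(|line| {
            serde_json::from_str(line)
                .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))
        })
        .collect()
}

A receiver along these lines could also compare the incoming Authorization header against the value it expects before trusting the payload, mirroring the header the scheduler sends when one is configured.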