From 9df8cfc013452ecb5935d5501c96a4c465183a5d Mon Sep 17 00:00:00 2001 From: Paul Sanders Date: Mon, 18 Dec 2023 13:05:46 -0500 Subject: [PATCH 01/41] Add libstdc++ in Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2cc1dcaec..f6be39785 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,7 @@ ENV MEILI_HTTP_ADDR 0.0.0.0:7700 ENV MEILI_SERVER_PROVIDER docker RUN apk update --quiet \ - && apk add -q --no-cache libgcc tini curl + && apk add -q --no-cache libgcc libstdc++ tini curl # add meilisearch and meilitool to the `/bin` so you can run it from anywhere # and it's easy to find. From 942d49314c75d00aa5e43df220d5acfef47e6b64 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Dec 2023 22:14:26 +0100 Subject: [PATCH 02/41] Remove dependency that requires libstdc++ --- Cargo.lock | 5 ----- milli/Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d57d381ed..edcd26bf9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1592,9 +1592,6 @@ name = "esaxx-rs" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" -dependencies = [ - "cc", -] [[package]] name = "fancy-regex" @@ -5310,11 +5307,9 @@ version = "0.14.1" source = "git+https://github.com/huggingface/tokenizers.git?tag=v0.14.1#6357206cdcce4d78ffb1e0372feb456caea09375" dependencies = [ "aho-corasick", - "clap", "derive_builder", "esaxx-rs", "getrandom", - "indicatif", "itertools 0.11.0", "lazy_static", "log", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b977d64f1..3e10c175a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -77,7 +77,7 @@ csv = "1.2.1" candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1" } +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] } hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ "online", ] } From b2193e612ff447fe152b4fc990a48e76516b43d1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Dec 2023 22:17:29 +0100 Subject: [PATCH 03/41] Revert "Add libstdc++ in Dockerfile" as it is no longer needed This reverts commit 9df8cfc013452ecb5935d5501c96a4c465183a5d. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f6be39785..2cc1dcaec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,7 @@ ENV MEILI_HTTP_ADDR 0.0.0.0:7700 ENV MEILI_SERVER_PROVIDER docker RUN apk update --quiet \ - && apk add -q --no-cache libgcc libstdc++ tini curl + && apk add -q --no-cache libgcc tini curl # add meilisearch and meilitool to the `/bin` so you can run it from anywhere # and it's easy to find. From 1956045a0619c5f80551c6f662823158f0b6974b Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 27 Nov 2023 14:45:18 +0100 Subject: [PATCH 04/41] add the option --- meilisearch/src/analytics/segment_analytics.rs | 1 + meilisearch/src/option.rs | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 1ad277c28..6a617577b 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -290,6 +290,7 @@ impl From for Infos { http_addr, master_key: _, env, + task_webhook_url: _, max_index_size: _, max_task_db_size: _, http_payload_size_limit, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 1ed20f5b5..37472e174 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -28,6 +28,7 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH"; const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR"; const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; +const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; #[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; @@ -156,6 +157,10 @@ pub struct Opt { #[serde(default = "default_env")] pub env: String, + /// Called whenever a task finishes so a third party can be notified. + #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] + pub task_webhook_url: Option, + /// Deactivates Meilisearch's built-in telemetry when provided. /// /// Meilisearch automatically collects data from all instances that do not opt out using this flag. @@ -375,6 +380,7 @@ impl Opt { http_addr, master_key, env, + task_webhook_url, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -409,6 +415,10 @@ impl Opt { export_to_env_if_not_present(MEILI_MASTER_KEY, master_key); } export_to_env_if_not_present(MEILI_ENV, env); + if let Some(task_webhook_url) = task_webhook_url { + export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url); + } + #[cfg(feature = "analytics")] { export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); From d78ad51082c4cfc3598a0105655f25154c9e9b60 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 27 Nov 2023 15:11:22 +0100 Subject: [PATCH 05/41] Implement the webhook --- Cargo.lock | 1 + index-scheduler/Cargo.toml | 1 + index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 35 ++++++++++++++++++++++++--- meilisearch/src/lib.rs | 1 + 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d57d381ed..ca7eb715f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2792,6 +2792,7 @@ dependencies = [ "tempfile", "thiserror", "time", + "ureq", "uuid 1.5.0", ] diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index c4a37b7d6..4d6e4ffd0 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -30,6 +30,7 @@ synchronoise = "1.0.1" tempfile = "3.5.0" thiserror = "1.0.40" time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] } +ureq = "2.9.1" uuid = { version = "1.3.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index ddb9e934a..9261bf66d 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -37,6 +37,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { snapshots_path: _, auth_path: _, version_file_path: _, + webhook_url: _, test_breakpoint_sdr: _, planned_failures: _, run_loop_iteration: _, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b9b360fa4..6756990af 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -170,8 +170,8 @@ impl ProcessingTasks { } /// Set the processing tasks to an empty list - fn stop_processing(&mut self) { - self.processing = RoaringBitmap::new(); + fn stop_processing(&mut self) -> RoaringBitmap { + std::mem::take(&mut self.processing) } /// Returns `true` if there, at least, is one task that is currently processing that we must stop. @@ -241,6 +241,7 @@ pub struct IndexSchedulerOptions { pub snapshots_path: PathBuf, /// The path to the folder containing the dumps. pub dumps_path: PathBuf, + pub webhook_url: Option, /// The maximum size, in bytes, of the task index. pub task_db_size: usize, /// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index. @@ -323,6 +324,9 @@ pub struct IndexScheduler { /// The maximum number of tasks that will be batched together. pub(crate) max_number_of_batched_tasks: usize, + /// The webhook url we should send tasks to after processing every batches. + pub(crate) webhook_url: Option, + /// A frame to output the indexation profiling files to disk. pub(crate) puffin_frame: Arc, @@ -388,6 +392,7 @@ impl IndexScheduler { dumps_path: self.dumps_path.clone(), auth_path: self.auth_path.clone(), version_file_path: self.version_file_path.clone(), + webhook_url: self.webhook_url.clone(), currently_updating_index: self.currently_updating_index.clone(), embedders: self.embedders.clone(), #[cfg(test)] @@ -487,6 +492,7 @@ impl IndexScheduler { snapshots_path: options.snapshots_path, auth_path: options.auth_path, version_file_path: options.version_file_path, + webhook_url: options.webhook_url, currently_updating_index: Arc::new(RwLock::new(None)), embedders: Default::default(), @@ -1251,19 +1257,41 @@ impl IndexScheduler { } } - self.processing_tasks.write().unwrap().stop_processing(); + let processed = self.processing_tasks.write().unwrap().stop_processing(); #[cfg(test)] self.maybe_fail(tests::FailureLocation::CommittingWtxn)?; wtxn.commit().map_err(Error::HeedTransaction)?; + // We shouldn't crash the tick function if we can't send data to the webhook. + let _ = self.notify_webhook(&processed); + #[cfg(test)] self.breakpoint(Breakpoint::AfterProcessing); Ok(TickOutcome::TickAgain(processed_tasks)) } + /// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one. + fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> { + if let Some(ref url) = self.webhook_url { + let rtxn = self.env.read_txn()?; + + // on average a task takes ~50 bytes + let mut buffer = Vec::with_capacity(updated.len() as usize * 50); + + for id in updated { + let task = self.get_task(&rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; + let _ = serde_json::to_writer(&mut buffer, &task); + } + + let _ = ureq::post(url).send_bytes(&buffer); + } + + Ok(()) + } + /// Register a task to cleanup the task queue if needed fn cleanup_task_queue(&self) -> Result<()> { let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; @@ -1677,6 +1705,7 @@ mod tests { indexes_path: tempdir.path().join("indexes"), snapshots_path: tempdir.path().join("snapshots"), dumps_path: tempdir.path().join("dumps"), + webhook_url: None, task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. enable_mdb_writemap: false, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index e0f488eea..14a1c5b45 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -228,6 +228,7 @@ fn open_or_create_database_unchecked( indexes_path: opt.db_path.join("indexes"), snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), + webhook_url: opt.task_webhook_url.clone(), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, From 391eb72137e451ff4c6252c752b868695e4d74aa Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 11:39:42 +0100 Subject: [PATCH 06/41] start writing a test with actix but it doesn't works --- meilisearch/tests/tasks/mod.rs | 1 + meilisearch/tests/tasks/webhook.rs | 101 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 meilisearch/tests/tasks/webhook.rs diff --git a/meilisearch/tests/tasks/mod.rs b/meilisearch/tests/tasks/mod.rs index 7a5fa6388..ed387224e 100644 --- a/meilisearch/tests/tasks/mod.rs +++ b/meilisearch/tests/tasks/mod.rs @@ -1,4 +1,5 @@ mod errors; +mod webhook; use meili_snap::insta::assert_json_snapshot; use time::format_description::well_known::Rfc3339; diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs new file mode 100644 index 000000000..34edb53eb --- /dev/null +++ b/meilisearch/tests/tasks/webhook.rs @@ -0,0 +1,101 @@ +use std::sync::Arc; + +use actix_http::body::MessageBody; +use actix_web::dev::{ServiceFactory, ServiceResponse}; +use actix_web::web::{Bytes, Data}; +use actix_web::{post, App, HttpResponse, HttpServer}; +use meili_snap::snapshot; +use meilisearch::Opt; +use tokio::sync::mpsc; + +use crate::common::{default_settings, Server}; +use crate::json; + +#[post("/")] +async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { + println!("Received something"); + let body = body.to_vec(); + sender.send(body).await.unwrap(); + HttpResponse::Ok().into() +} + +fn create_app( + sender: Arc>>, +) -> actix_web::App< + impl ServiceFactory< + actix_web::dev::ServiceRequest, + Config = (), + Response = ServiceResponse, + Error = actix_web::Error, + InitError = (), + >, +> { + App::new().service(forward_body).app_data(Data::from(sender)) +} + +struct WebhookHandle { + pub server_handle: tokio::task::JoinHandle>, + pub url: String, + pub receiver: mpsc::UnboundedReceiver>, +} + +async fn create_webhook_server() -> WebhookHandle { + let (sender, receiver) = mpsc::unbounded_channel(); + let sender = Arc::new(sender); + + let server = + HttpServer::new(move || create_app(sender.clone())).bind(("localhost", 0)).unwrap(); + let (ip, scheme) = server.addrs_with_scheme()[0]; + let url = format!("{scheme}://{ip}/"); + + // TODO: Is it cleaned once the test is over + let server_handle = tokio::spawn(server.run()); + + WebhookHandle { server_handle, url, receiver } +} + +#[actix_web::test] +async fn test_basic_webhook() { + // Request a new server from the pool + let mut handle = create_webhook_server().await; + + let db_path = tempfile::tempdir().unwrap(); + // let (_handle, mut webhook) = create_webhook_server().await; + let server = Server::new_with_options(Opt { + task_webhook_url: Some(handle.url.clone()), + ..default_settings(db_path.path()) + }) + .await + .unwrap(); + + println!("Sending something"); + reqwest::Client::new().post(&handle.url).body("hello").send().await.unwrap(); + + // let (_, status) = server.create_index(json!({ "uid": "tamo" })).await; + // snapshot!(status, @"202 Accepted"); + + let payload = handle.receiver.recv().await.unwrap(); + let jsonl = String::from_utf8(payload).unwrap(); + + // TODO: kill the server + // handle.server_handle.; + + snapshot!(jsonl, + @r###" + { + "uid": 0, + "indexUid": null, + "status": "succeeded", + "type": "dumpCreation", + "canceledBy": null, + "details": { + "dumpUid": "[dumpUid]" + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} From fbea72137829e02ad0cc5e1df831ac0e449d4e9b Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 14:47:07 +0100 Subject: [PATCH 07/41] add a first working test with actixweb --- index-scheduler/src/lib.rs | 3 +- meilisearch/tests/tasks/webhook.rs | 46 ++++++++++++------------------ 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 6756990af..8502da242 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1286,7 +1286,8 @@ impl IndexScheduler { let _ = serde_json::to_writer(&mut buffer, &task); } - let _ = ureq::post(url).send_bytes(&buffer); + println!("Sending request to {url}"); + let _ = ureq::post(url).send_bytes(&buffer).unwrap(); } Ok(()) diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index 34edb53eb..c95d3fc5b 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -12,10 +12,10 @@ use crate::common::{default_settings, Server}; use crate::json; #[post("/")] -async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { +async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { println!("Received something"); let body = body.to_vec(); - sender.send(body).await.unwrap(); + sender.send(body).unwrap(); HttpResponse::Ok().into() } @@ -40,13 +40,18 @@ struct WebhookHandle { } async fn create_webhook_server() -> WebhookHandle { + let mut log_builder = env_logger::Builder::new(); + log_builder.parse_filters("debug"); + log_builder.init(); + let (sender, receiver) = mpsc::unbounded_channel(); let sender = Arc::new(sender); let server = - HttpServer::new(move || create_app(sender.clone())).bind(("localhost", 0)).unwrap(); + HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap(); let (ip, scheme) = server.addrs_with_scheme()[0]; let url = format!("{scheme}://{ip}/"); + println!("url is {url}"); // TODO: Is it cleaned once the test is over let server_handle = tokio::spawn(server.run()); @@ -60,7 +65,6 @@ async fn test_basic_webhook() { let mut handle = create_webhook_server().await; let db_path = tempfile::tempdir().unwrap(); - // let (_handle, mut webhook) = create_webhook_server().await; let server = Server::new_with_options(Opt { task_webhook_url: Some(handle.url.clone()), ..default_settings(db_path.path()) @@ -68,34 +72,20 @@ async fn test_basic_webhook() { .await .unwrap(); - println!("Sending something"); - reqwest::Client::new().post(&handle.url).body("hello").send().await.unwrap(); - - // let (_, status) = server.create_index(json!({ "uid": "tamo" })).await; - // snapshot!(status, @"202 Accepted"); + let index = server.index("tamo"); + for i in 0..3 { + let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; + } let payload = handle.receiver.recv().await.unwrap(); let jsonl = String::from_utf8(payload).unwrap(); - // TODO: kill the server - // handle.server_handle.; + snapshot!(jsonl, + @r###"{"uid":0,"enqueuedAt":"2023-11-28T13:43:24.754587Z","startedAt":"2023-11-28T13:43:24.756445Z","finishedAt":"2023-11-28T13:43:24.791527Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"ca77ac82-4504-4c85-81a5-1a8d68f1a386","documents_count":1,"allow_index_creation":true}}}"###); + + let payload = handle.receiver.recv().await.unwrap(); + let jsonl = String::from_utf8(payload).unwrap(); snapshot!(jsonl, - @r###" - { - "uid": 0, - "indexUid": null, - "status": "succeeded", - "type": "dumpCreation", - "canceledBy": null, - "details": { - "dumpUid": "[dumpUid]" - }, - "error": null, - "duration": "[duration]", - "enqueuedAt": "[date]", - "startedAt": "[date]", - "finishedAt": "[date]" - } - "###); + @r###"{"uid":1,"enqueuedAt":"2023-11-28T13:43:24.761498Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"c947aefa-7f98-433d-8ce4-5926d8d2ce10","documents_count":1,"allow_index_creation":true}}}{"uid":2,"enqueuedAt":"2023-11-28T13:43:24.76776Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"a21d6da6-9322-4827-8c08-f33d2e1b6cae","documents_count":1,"allow_index_creation":true}}}"###); } From 3adbc2b942e7bb0808720f46838aab2acef26e00 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 15:08:13 +0100 Subject: [PATCH 08/41] return a task view instead of a task --- index-scheduler/src/lib.rs | 4 +- meilisearch-types/src/lib.rs | 1 + meilisearch-types/src/task_view.rs | 139 ++++++++++++++++++++++++++++ meilisearch/src/routes/tasks.rs | 140 +---------------------------- meilisearch/tests/tasks/webhook.rs | 10 ++- 5 files changed, 153 insertions(+), 141 deletions(-) create mode 100644 meilisearch-types/src/task_view.rs diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8502da242..d96b6c2af 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -54,6 +54,7 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; +use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use puffin::FrameView; use roaring::RoaringBitmap; @@ -1283,7 +1284,8 @@ impl IndexScheduler { for id in updated { let task = self.get_task(&rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - let _ = serde_json::to_writer(&mut buffer, &task); + let _ = serde_json::to_writer(&mut buffer, &TaskView::from_task(&task)); + buffer.push(b'\n'); } println!("Sending request to {url}"); diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs index b0762563a..e4f5cbeb4 100644 --- a/meilisearch-types/src/lib.rs +++ b/meilisearch-types/src/lib.rs @@ -9,6 +9,7 @@ pub mod index_uid_pattern; pub mod keys; pub mod settings; pub mod star_or; +pub mod task_view; pub mod tasks; pub mod versioning; pub use milli::{heed, Index}; diff --git a/meilisearch-types/src/task_view.rs b/meilisearch-types/src/task_view.rs new file mode 100644 index 000000000..02be91a88 --- /dev/null +++ b/meilisearch-types/src/task_view.rs @@ -0,0 +1,139 @@ +use serde::Serialize; +use time::{Duration, OffsetDateTime}; + +use crate::error::ResponseError; +use crate::settings::{Settings, Unchecked}; +use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct TaskView { + pub uid: TaskId, + #[serde(default)] + pub index_uid: Option, + pub status: Status, + #[serde(rename = "type")] + pub kind: Kind, + pub canceled_by: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub details: Option, + pub error: Option, + #[serde(serialize_with = "serialize_duration", default)] + pub duration: Option, + #[serde(with = "time::serde::rfc3339")] + pub enqueued_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339::option", default)] + pub started_at: Option, + #[serde(with = "time::serde::rfc3339::option", default)] + pub finished_at: Option, +} + +impl TaskView { + pub fn from_task(task: &Task) -> TaskView { + TaskView { + uid: task.uid, + index_uid: task.index_uid().map(ToOwned::to_owned), + status: task.status, + kind: task.kind.as_kind(), + canceled_by: task.canceled_by, + details: task.details.clone().map(DetailsView::from), + error: task.error.clone(), + duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start), + enqueued_at: task.enqueued_at, + started_at: task.started_at, + finished_at: task.finished_at, + } + } +} + +#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DetailsView { + #[serde(skip_serializing_if = "Option::is_none")] + pub received_documents: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub indexed_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub primary_key: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub provided_ids: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub matched_tasks: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub canceled_tasks: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_tasks: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub original_filter: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub dump_uid: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(flatten)] + pub settings: Option>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub swaps: Option>, +} + +impl From
for DetailsView { + fn from(details: Details) -> Self { + match details { + Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => { + DetailsView { + received_documents: Some(received_documents), + indexed_documents: Some(indexed_documents), + ..DetailsView::default() + } + } + Details::SettingsUpdate { settings } => { + DetailsView { settings: Some(settings), ..DetailsView::default() } + } + Details::IndexInfo { primary_key } => { + DetailsView { primary_key: Some(primary_key), ..DetailsView::default() } + } + Details::DocumentDeletion { + provided_ids: received_document_ids, + deleted_documents, + } => DetailsView { + provided_ids: Some(received_document_ids), + deleted_documents: Some(deleted_documents), + original_filter: Some(None), + ..DetailsView::default() + }, + Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { + DetailsView { + provided_ids: Some(0), + original_filter: Some(Some(original_filter)), + deleted_documents: Some(deleted_documents), + ..DetailsView::default() + } + } + Details::ClearAll { deleted_documents } => { + DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() } + } + Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => { + DetailsView { + matched_tasks: Some(matched_tasks), + canceled_tasks: Some(canceled_tasks), + original_filter: Some(Some(original_filter)), + ..DetailsView::default() + } + } + Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => { + DetailsView { + matched_tasks: Some(matched_tasks), + deleted_tasks: Some(deleted_tasks), + original_filter: Some(Some(original_filter)), + ..DetailsView::default() + } + } + Details::Dump { dump_uid } => { + DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() } + } + Details::IndexSwap { swaps } => { + DetailsView { swaps: Some(swaps), ..Default::default() } + } + } + } +} diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index f7d4c44d7..03b63001d 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -8,11 +8,9 @@ use meilisearch_types::deserr::DeserrQueryParamError; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{InvalidTaskDateError, ResponseError}; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList}; -use meilisearch_types::tasks::{ - serialize_duration, Details, IndexSwap, Kind, KindWithContent, Status, Task, -}; +use meilisearch_types::task_view::TaskView; +use meilisearch_types::tasks::{Kind, KindWithContent, Status}; use serde::Serialize; use serde_json::json; use time::format_description::well_known::Rfc3339; @@ -37,140 +35,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks)))) .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))); } - -#[derive(Debug, Clone, PartialEq, Eq, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct TaskView { - pub uid: TaskId, - #[serde(default)] - pub index_uid: Option, - pub status: Status, - #[serde(rename = "type")] - pub kind: Kind, - pub canceled_by: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option, - pub error: Option, - #[serde(serialize_with = "serialize_duration", default)] - pub duration: Option, - #[serde(with = "time::serde::rfc3339")] - pub enqueued_at: OffsetDateTime, - #[serde(with = "time::serde::rfc3339::option", default)] - pub started_at: Option, - #[serde(with = "time::serde::rfc3339::option", default)] - pub finished_at: Option, -} - -impl TaskView { - pub fn from_task(task: &Task) -> TaskView { - TaskView { - uid: task.uid, - index_uid: task.index_uid().map(ToOwned::to_owned), - status: task.status, - kind: task.kind.as_kind(), - canceled_by: task.canceled_by, - details: task.details.clone().map(DetailsView::from), - error: task.error.clone(), - duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start), - enqueued_at: task.enqueued_at, - started_at: task.started_at, - finished_at: task.finished_at, - } - } -} - -#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct DetailsView { - #[serde(skip_serializing_if = "Option::is_none")] - pub received_documents: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub indexed_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub primary_key: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub provided_ids: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub deleted_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub matched_tasks: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub canceled_tasks: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub deleted_tasks: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub original_filter: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub dump_uid: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(flatten)] - pub settings: Option>>, - #[serde(skip_serializing_if = "Option::is_none")] - pub swaps: Option>, -} - -impl From
for DetailsView { - fn from(details: Details) -> Self { - match details { - Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => { - DetailsView { - received_documents: Some(received_documents), - indexed_documents: Some(indexed_documents), - ..DetailsView::default() - } - } - Details::SettingsUpdate { settings } => { - DetailsView { settings: Some(settings), ..DetailsView::default() } - } - Details::IndexInfo { primary_key } => { - DetailsView { primary_key: Some(primary_key), ..DetailsView::default() } - } - Details::DocumentDeletion { - provided_ids: received_document_ids, - deleted_documents, - } => DetailsView { - provided_ids: Some(received_document_ids), - deleted_documents: Some(deleted_documents), - original_filter: Some(None), - ..DetailsView::default() - }, - Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { - DetailsView { - provided_ids: Some(0), - original_filter: Some(Some(original_filter)), - deleted_documents: Some(deleted_documents), - ..DetailsView::default() - } - } - Details::ClearAll { deleted_documents } => { - DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() } - } - Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => { - DetailsView { - matched_tasks: Some(matched_tasks), - canceled_tasks: Some(canceled_tasks), - original_filter: Some(Some(original_filter)), - ..DetailsView::default() - } - } - Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => { - DetailsView { - matched_tasks: Some(matched_tasks), - deleted_tasks: Some(deleted_tasks), - original_filter: Some(Some(original_filter)), - ..DetailsView::default() - } - } - Details::Dump { dump_uid } => { - DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() } - } - Details::IndexSwap { swaps } => { - DetailsView { swaps: Some(swaps), ..Default::default() } - } - } - } -} - #[derive(Debug, Deserr)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] pub struct TasksFilterQuery { diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index c95d3fc5b..613b4ff3f 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -73,6 +73,7 @@ async fn test_basic_webhook() { .unwrap(); let index = server.index("tamo"); + // TODO: may be flaky, we're relying on the fact that during the time the first document addition succeed, the two other operations will be received. for i in 0..3 { let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; } @@ -81,11 +82,16 @@ async fn test_basic_webhook() { let jsonl = String::from_utf8(payload).unwrap(); snapshot!(jsonl, - @r###"{"uid":0,"enqueuedAt":"2023-11-28T13:43:24.754587Z","startedAt":"2023-11-28T13:43:24.756445Z","finishedAt":"2023-11-28T13:43:24.791527Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"ca77ac82-4504-4c85-81a5-1a8d68f1a386","documents_count":1,"allow_index_creation":true}}}"###); + @r###" + {"uid":0,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.027444S","enqueuedAt":"2023-11-28T14:05:37.767678Z","startedAt":"2023-11-28T14:05:37.769519Z","finishedAt":"2023-11-28T14:05:37.796963Z"} + "###); let payload = handle.receiver.recv().await.unwrap(); let jsonl = String::from_utf8(payload).unwrap(); snapshot!(jsonl, - @r###"{"uid":1,"enqueuedAt":"2023-11-28T13:43:24.761498Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"c947aefa-7f98-433d-8ce4-5926d8d2ce10","documents_count":1,"allow_index_creation":true}}}{"uid":2,"enqueuedAt":"2023-11-28T13:43:24.76776Z","startedAt":"2023-11-28T13:43:24.793989Z","finishedAt":"2023-11-28T13:43:24.814623Z","error":null,"canceledBy":null,"details":{"DocumentAdditionOrUpdate":{"received_documents":1,"indexed_documents":1}},"status":"succeeded","kind":{"documentAdditionOrUpdate":{"index_uid":"tamo","primary_key":null,"method":"ReplaceDocuments","content_file":"a21d6da6-9322-4827-8c08-f33d2e1b6cae","documents_count":1,"allow_index_creation":true}}}"###); + @r###" + {"uid":1,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.773731Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} + {"uid":2,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.780466Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} + "###); } From 0b2fff27f231c3eb37c9340a6879135c70a227d2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 15:55:48 +0100 Subject: [PATCH 09/41] update and fix the test --- index-scheduler/src/lib.rs | 1 - meilisearch/tests/tasks/webhook.rs | 73 ++++++++++++++++++++---------- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index d96b6c2af..5d44ed104 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1288,7 +1288,6 @@ impl IndexScheduler { buffer.push(b'\n'); } - println!("Sending request to {url}"); let _ = ureq::post(url).send_bytes(&buffer).unwrap(); } diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index 613b4ff3f..e852839ec 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -1,10 +1,14 @@ +//! To test the webhook, we need to spawn a new server with a URL listening for +//! post requests. The webhook handle starts a server and forwards all the +//! received requests into a channel for you to handle. + use std::sync::Arc; use actix_http::body::MessageBody; use actix_web::dev::{ServiceFactory, ServiceResponse}; use actix_web::web::{Bytes, Data}; use actix_web::{post, App, HttpResponse, HttpServer}; -use meili_snap::snapshot; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tokio::sync::mpsc; @@ -13,7 +17,6 @@ use crate::json; #[post("/")] async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { - println!("Received something"); let body = body.to_vec(); sender.send(body).unwrap(); HttpResponse::Ok().into() @@ -47,51 +50,73 @@ async fn create_webhook_server() -> WebhookHandle { let (sender, receiver) = mpsc::unbounded_channel(); let sender = Arc::new(sender); + // By listening on the port 0, the system will give us any available port. let server = HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap(); let (ip, scheme) = server.addrs_with_scheme()[0]; let url = format!("{scheme}://{ip}/"); - println!("url is {url}"); - // TODO: Is it cleaned once the test is over let server_handle = tokio::spawn(server.run()); - WebhookHandle { server_handle, url, receiver } } #[actix_web::test] async fn test_basic_webhook() { - // Request a new server from the pool - let mut handle = create_webhook_server().await; + let WebhookHandle { server_handle, url, mut receiver } = create_webhook_server().await; let db_path = tempfile::tempdir().unwrap(); let server = Server::new_with_options(Opt { - task_webhook_url: Some(handle.url.clone()), + task_webhook_url: Some(url), ..default_settings(db_path.path()) }) .await .unwrap(); let index = server.index("tamo"); - // TODO: may be flaky, we're relying on the fact that during the time the first document addition succeed, the two other operations will be received. - for i in 0..3 { + // May be flaky: we're relying on the fact that while the first document addition is processed, the other + // operations will be received and will be batched together. If it doesn't happen it's not a problem + // the rest of the test won't assume anything about the number of tasks per batch. + for i in 0..5 { let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; } - let payload = handle.receiver.recv().await.unwrap(); - let jsonl = String::from_utf8(payload).unwrap(); + let mut nb_tasks = 0; + while let Some(payload) = receiver.recv().await { + let payload = String::from_utf8(payload).unwrap(); + let jsonl = payload.split('\n'); + for json in jsonl { + if json.is_empty() { + break; // we reached EOF + } + nb_tasks += 1; + let json: serde_json::Value = serde_json::from_str(json).unwrap(); + snapshot!( + json_string!(json, { ".uid" => "[uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": "[uid]", + "indexUid": "tamo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + } + if nb_tasks == 5 { + break; + } + } - snapshot!(jsonl, - @r###" - {"uid":0,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.027444S","enqueuedAt":"2023-11-28T14:05:37.767678Z","startedAt":"2023-11-28T14:05:37.769519Z","finishedAt":"2023-11-28T14:05:37.796963Z"} - "###); + assert!(nb_tasks == 5, "We should have received the 5 tasks but only received {nb_tasks}"); - let payload = handle.receiver.recv().await.unwrap(); - let jsonl = String::from_utf8(payload).unwrap(); - - snapshot!(jsonl, - @r###" - {"uid":1,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.773731Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} - {"uid":2,"indexUid":"tamo","status":"succeeded","type":"documentAdditionOrUpdate","canceledBy":null,"details":{"receivedDocuments":1,"indexedDocuments":1},"error":null,"duration":"PT0.020221S","enqueuedAt":"2023-11-28T14:05:37.780466Z","startedAt":"2023-11-28T14:05:37.799448Z","finishedAt":"2023-11-28T14:05:37.819669Z"} - "###); + server_handle.abort(); } From 547379abb018af63e21a550384d19cf2c1fbb033 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Nov 2023 16:28:11 +0100 Subject: [PATCH 10/41] parse the url correctly --- Cargo.lock | 18 ++++++++++-------- meilisearch/Cargo.toml | 1 + meilisearch/src/lib.rs | 2 +- meilisearch/src/option.rs | 9 +++++++-- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ca7eb715f..ab566e836 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1704,9 +1704,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -2756,9 +2756,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -3563,6 +3563,7 @@ dependencies = [ "tokio", "tokio-stream", "toml", + "url", "urlencoding", "uuid 1.5.0", "vergen", @@ -4071,9 +4072,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" @@ -5603,13 +5604,14 @@ dependencies = [ [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index c59b38fa6..434a488f5 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -104,6 +104,7 @@ walkdir = "2.3.3" yaup = "0.2.1" serde_urlencoded = "0.7.1" termcolor = "1.2.0" +url = { version = "2.5.0", features = ["serde"] } [dev-dependencies] actix-rt = "2.8.0" diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 14a1c5b45..3698e5da4 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -228,7 +228,7 @@ fn open_or_create_database_unchecked( indexes_path: opt.db_path.join("indexes"), snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), - webhook_url: opt.task_webhook_url.clone(), + webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 37472e174..e1f16d888 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -21,6 +21,7 @@ use rustls::RootCertStore; use rustls_pemfile::{certs, pkcs8_private_keys, rsa_private_keys}; use serde::{Deserialize, Serialize}; use sysinfo::{RefreshKind, System, SystemExt}; +use url::Url; const POSSIBLE_ENV: [&str; 2] = ["development", "production"]; @@ -69,6 +70,10 @@ const MEILI_MAX_INDEXING_MEMORY: &str = "MEILI_MAX_INDEXING_MEMORY"; const MEILI_MAX_INDEXING_THREADS: &str = "MEILI_MAX_INDEXING_THREADS"; const DEFAULT_LOG_EVERY_N: usize = 100_000; +fn parse_url(s: &str) -> Result { + Url::parse(s) +} + // Each environment (index and task-db) is taking space in the virtual address space. // Ideally, indexes can occupy 2TiB each to avoid having to manually resize them. // The actual size of the virtual address space is computed at startup to determine how many 2TiB indexes can be @@ -159,7 +164,7 @@ pub struct Opt { /// Called whenever a task finishes so a third party can be notified. #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] - pub task_webhook_url: Option, + pub task_webhook_url: Option, /// Deactivates Meilisearch's built-in telemetry when provided. /// @@ -416,7 +421,7 @@ impl Opt { } export_to_env_if_not_present(MEILI_ENV, env); if let Some(task_webhook_url) = task_webhook_url { - export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url); + export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string()); } #[cfg(feature = "analytics")] From be72326c0a656841e3e5d051284f8997abe2b065 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Nov 2023 13:09:04 +0100 Subject: [PATCH 11/41] gzip the tasks --- Cargo.lock | 5 +++-- index-scheduler/Cargo.toml | 1 + index-scheduler/src/lib.rs | 7 ++++++- meilisearch/tests/tasks/webhook.rs | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab566e836..e632d8621 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1680,9 +1680,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.26" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", @@ -2777,6 +2777,7 @@ dependencies = [ "dump", "enum-iterator", "file-store", + "flate2", "insta", "log", "meili-snap", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 4d6e4ffd0..a8c19f435 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -18,6 +18,7 @@ derive_builder = "0.12.0" dump = { path = "../dump" } enum-iterator = "1.4.0" file-store = { path = "../file-store" } +flate2 = "1.0.28" log = "0.4.17" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5d44ed104..bfaca3126 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -45,6 +45,8 @@ use dump::{KindDump, TaskDump, UpdateFile}; pub use error::Error; pub use features::RoFeatures; use file_store::FileStore; +use flate2::bufread::GzEncoder; +use flate2::Compression; use meilisearch_types::error::ResponseError; use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; @@ -1288,7 +1290,10 @@ impl IndexScheduler { buffer.push(b'\n'); } - let _ = ureq::post(url).send_bytes(&buffer).unwrap(); + let reader = GzEncoder::new(&buffer[..], Compression::default()); + if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) { + log::error!("While sending data to the webhook: {e}"); + } } Ok(()) diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index e852839ec..688d35e8b 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -11,6 +11,7 @@ use actix_web::{post, App, HttpResponse, HttpServer}; use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tokio::sync::mpsc; +use url::Url; use crate::common::{default_settings, Server}; use crate::json; @@ -66,7 +67,7 @@ async fn test_basic_webhook() { let db_path = tempfile::tempdir().unwrap(); let server = Server::new_with_options(Opt { - task_webhook_url: Some(url), + task_webhook_url: Some(Url::parse(&url).unwrap()), ..default_settings(db_path.path()) }) .await From c83a33017e5224b4f7dfa713201b80b1070b69bc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Nov 2023 14:27:50 +0100 Subject: [PATCH 12/41] stream and chunk the data --- index-scheduler/src/lib.rs | 63 +++++++++++++++++++++++++----- meilisearch/tests/tasks/webhook.rs | 2 +- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index bfaca3126..b5b061a50 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -34,6 +34,7 @@ pub type TaskId = u32; use std::collections::{BTreeMap, HashMap}; use std::fs::File; +use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicBool; @@ -1279,18 +1280,60 @@ impl IndexScheduler { /// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one. fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> { if let Some(ref url) = self.webhook_url { - let rtxn = self.env.read_txn()?; - - // on average a task takes ~50 bytes - let mut buffer = Vec::with_capacity(updated.len() as usize * 50); - - for id in updated { - let task = self.get_task(&rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - let _ = serde_json::to_writer(&mut buffer, &TaskView::from_task(&task)); - buffer.push(b'\n'); + struct TaskReader<'a, 'b> { + rtxn: &'a RoTxn<'a>, + index_scheduler: &'a IndexScheduler, + tasks: &'b mut roaring::bitmap::Iter<'b>, + buffer: Vec, + written: usize, } - let reader = GzEncoder::new(&buffer[..], Compression::default()); + impl<'a, 'b> Read for TaskReader<'a, 'b> { + fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result { + if self.buffer.is_empty() { + match self.tasks.next() { + None => return Ok(0), + Some(task_id) => { + let task = self + .index_scheduler + .get_task(self.rtxn, task_id) + .map_err(io::Error::other)? + .ok_or_else(|| io::Error::other(Error::CorruptedTaskQueue))?; + + serde_json::to_writer( + &mut self.buffer, + &TaskView::from_task(&task), + )?; + self.buffer.push(b'\n'); + } + } + } + + let mut to_write = &self.buffer[self.written..]; + let wrote = io::copy(&mut to_write, &mut buf)?; + self.written += wrote as usize; + + // we wrote everything and must refresh our buffer on the next call + if self.written == self.buffer.len() { + self.written = 0; + self.buffer.clear(); + } + + Ok(wrote as usize) + } + } + + let rtxn = self.env.read_txn()?; + + let task_reader = TaskReader { + rtxn: &rtxn, + index_scheduler: self, + tasks: &mut updated.into_iter(), + buffer: Vec::with_capacity(50), // on average a task is around ~100 bytes + written: 0, + }; + + let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) { log::error!("While sending data to the webhook: {e}"); } diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index 688d35e8b..6979ff294 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -45,7 +45,7 @@ struct WebhookHandle { async fn create_webhook_server() -> WebhookHandle { let mut log_builder = env_logger::Builder::new(); - log_builder.parse_filters("debug"); + log_builder.parse_filters("info"); log_builder.init(); let (sender, receiver) = mpsc::unbounded_channel(); From 4fb25b878240ab0bc5034c6b25a6cc282884652a Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Nov 2023 14:51:47 +0100 Subject: [PATCH 13/41] fix clippy --- index-scheduler/src/lib.rs | 9 +++++++-- meilisearch/src/option.rs | 4 ---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b5b061a50..2ad263ca4 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1297,8 +1297,13 @@ impl IndexScheduler { let task = self .index_scheduler .get_task(self.rtxn, task_id) - .map_err(io::Error::other)? - .ok_or_else(|| io::Error::other(Error::CorruptedTaskQueue))?; + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + Error::CorruptedTaskQueue, + ) + })?; serde_json::to_writer( &mut self.buffer, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index e1f16d888..abb2bab6c 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -70,10 +70,6 @@ const MEILI_MAX_INDEXING_MEMORY: &str = "MEILI_MAX_INDEXING_MEMORY"; const MEILI_MAX_INDEXING_THREADS: &str = "MEILI_MAX_INDEXING_THREADS"; const DEFAULT_LOG_EVERY_N: usize = 100_000; -fn parse_url(s: &str) -> Result { - Url::parse(s) -} - // Each environment (index and task-db) is taking space in the virtual address space. // Ideally, indexes can occupy 2TiB each to avoid having to manually resize them. // The actual size of the virtual address space is computed at startup to determine how many 2TiB indexes can be From 19736cefe858377c4c6143daa05479361b87ad8e Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Dec 2023 10:38:01 +0100 Subject: [PATCH 14/41] add the analytics --- meilisearch/src/analytics/segment_analytics.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 6a617577b..0cdb18540 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -264,6 +264,7 @@ struct Infos { ignore_snapshot_if_db_exists: bool, http_addr: bool, http_payload_size_limit: Byte, + task_queue_webhook: bool, log_level: String, max_indexing_memory: MaxMemory, max_indexing_threads: MaxThreads, @@ -290,7 +291,7 @@ impl From for Infos { http_addr, master_key: _, env, - task_webhook_url: _, + task_webhook_url, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -344,6 +345,7 @@ impl From for Infos { http_addr: http_addr != default_http_addr(), http_payload_size_limit, experimental_max_number_of_batched_tasks, + task_queue_webhook: task_webhook_url.is_some(), log_level: log_level.to_string(), max_indexing_memory, max_indexing_threads, From fa2b96b9a5e80fc5ba18a5d78d30113ca0ee6d37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 19 Dec 2023 12:18:45 +0100 Subject: [PATCH 15/41] Add an Authorization Header along with the webhook calls --- index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 17 ++++++++++++++++- meilisearch/src/analytics/segment_analytics.rs | 3 +++ meilisearch/src/lib.rs | 1 + meilisearch/src/option.rs | 12 ++++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 9261bf66d..0adda43ff 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -38,6 +38,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { auth_path: _, version_file_path: _, webhook_url: _, + webhook_authorization_header: _, test_breakpoint_sdr: _, planned_failures: _, run_loop_iteration: _, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 2ad263ca4..296f8add1 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -245,7 +245,10 @@ pub struct IndexSchedulerOptions { pub snapshots_path: PathBuf, /// The path to the folder containing the dumps. pub dumps_path: PathBuf, + /// The URL on which we must send the tasks statuses pub webhook_url: Option, + /// The value we will send into the Authorization HTTP header on the webhook URL + pub webhook_authorization_header: Option, /// The maximum size, in bytes, of the task index. pub task_db_size: usize, /// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index. @@ -330,6 +333,8 @@ pub struct IndexScheduler { /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, + /// The Authorization header to send to the webhook URL. + pub(crate) webhook_authorization_header: Option, /// A frame to output the indexation profiling files to disk. pub(crate) puffin_frame: Arc, @@ -397,6 +402,7 @@ impl IndexScheduler { auth_path: self.auth_path.clone(), version_file_path: self.version_file_path.clone(), webhook_url: self.webhook_url.clone(), + webhook_authorization_header: self.webhook_authorization_header.clone(), currently_updating_index: self.currently_updating_index.clone(), embedders: self.embedders.clone(), #[cfg(test)] @@ -497,6 +503,7 @@ impl IndexScheduler { auth_path: options.auth_path, version_file_path: options.version_file_path, webhook_url: options.webhook_url, + webhook_authorization_header: options.webhook_authorization_header, currently_updating_index: Arc::new(RwLock::new(None)), embedders: Default::default(), @@ -1338,8 +1345,15 @@ impl IndexScheduler { written: 0, }; + // let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); - if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) { + let request = ureq::post(url).set("Content-Encoding", "gzip"); + let request = match &self.webhook_authorization_header { + Some(header) => request.set("Authorization", header), + None => request, + }; + + if let Err(e) = request.send(reader) { log::error!("While sending data to the webhook: {e}"); } } @@ -1761,6 +1775,7 @@ mod tests { snapshots_path: tempdir.path().join("snapshots"), dumps_path: tempdir.path().join("dumps"), webhook_url: None, + webhook_authorization_header: None, task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. enable_mdb_writemap: false, diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 0cdb18540..86a5eddb9 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -265,6 +265,7 @@ struct Infos { http_addr: bool, http_payload_size_limit: Byte, task_queue_webhook: bool, + task_webhook_authorization_header: bool, log_level: String, max_indexing_memory: MaxMemory, max_indexing_threads: MaxThreads, @@ -292,6 +293,7 @@ impl From for Infos { master_key: _, env, task_webhook_url, + task_webhook_authorization_header, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -346,6 +348,7 @@ impl From for Infos { http_payload_size_limit, experimental_max_number_of_batched_tasks, task_queue_webhook: task_webhook_url.is_some(), + task_webhook_authorization_header: task_webhook_authorization_header.is_some(), log_level: log_level.to_string(), max_indexing_memory, max_indexing_threads, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 3698e5da4..f1111962c 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -229,6 +229,7 @@ fn open_or_create_database_unchecked( snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), + webhook_authorization_header: opt.task_webhook_authorization_header.clone(), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index abb2bab6c..a0672c9cf 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -30,6 +30,7 @@ const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR"; const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; +const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER"; #[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; @@ -162,6 +163,10 @@ pub struct Opt { #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] pub task_webhook_url: Option, + /// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified. + #[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)] + pub task_webhook_authorization_header: Option, + /// Deactivates Meilisearch's built-in telemetry when provided. /// /// Meilisearch automatically collects data from all instances that do not opt out using this flag. @@ -382,6 +387,7 @@ impl Opt { master_key, env, task_webhook_url, + task_webhook_authorization_header, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -419,6 +425,12 @@ impl Opt { if let Some(task_webhook_url) = task_webhook_url { export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string()); } + if let Some(task_webhook_authorization_header) = task_webhook_authorization_header { + export_to_env_if_not_present( + MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER, + task_webhook_authorization_header, + ); + } #[cfg(feature = "analytics")] { From 333ce12eb2dcfa947c464934c3dfda59c38f2d6e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 10:17:49 +0100 Subject: [PATCH 16/41] Fixed issue where the default revision is always the one we picked for the default model --- milli/src/vector/settings.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 912cdf953..1826c040d 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -228,6 +228,12 @@ impl From for crate::vector::hf::EmbedderOptions { let mut this = Self::default(); if let Some(model) = model.set() { this.model = model; + // Reset the revision if we are setting the model. + // This allows the following: + // "huggingFace": {} -> default model with default revision + // "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision + // "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision + this.revision = None; } if let Some(revision) = revision.set() { this.revision = Some(revision); From e249e4db7bb7bb874dffb79ca1161f8448065c2d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:07:18 +0100 Subject: [PATCH 17/41] Change Setting::apply function signature --- milli/src/update/settings.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index d406c121c..e47a5ad52 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -78,11 +78,19 @@ impl Setting { } } - pub fn apply(&mut self, new: Self) { + /// Returns `true` if applying the new setting changed this setting + pub fn apply(&mut self, new: Self) -> bool + where + T: PartialEq + Eq, + { if let Setting::NotSet = new { - return; + return false; + } + if self == &new { + return false; } *self = new; + true } } From 393216bf30b2942122b9228abd392a27b55a880e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:08:32 +0100 Subject: [PATCH 18/41] Flatten embedders settings --- milli/src/vector/openai.rs | 9 +- milli/src/vector/settings.rs | 464 ++++++++++++++++------------------- 2 files changed, 213 insertions(+), 260 deletions(-) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index c11e6ddc6..53e8a041b 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -34,6 +34,9 @@ pub struct EmbedderOptions { #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub enum EmbeddingModel { + // # WARNING + // + // If ever adding a model, make sure to add it to the list of supported models below. #[default] #[serde(rename = "text-embedding-ada-002")] #[deserr(rename = "text-embedding-ada-002")] @@ -41,6 +44,10 @@ pub enum EmbeddingModel { } impl EmbeddingModel { + pub fn supported_models() -> &'static [&'static str] { + &["text-embedding-ada-002"] + } + pub fn max_token(&self) -> usize { match self { EmbeddingModel::TextEmbeddingAda002 => 8191, @@ -59,7 +66,7 @@ impl EmbeddingModel { } } - pub fn from_name(name: &'static str) -> Option { + pub fn from_name(name: &str) -> Option { match name { "text-embedding-ada-002" => Some(EmbeddingModel::TextEmbeddingAda002), _ => None, diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 1826c040d..945fc62c0 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -4,32 +4,189 @@ use serde::{Deserialize, Serialize}; use crate::prompt::PromptData; use crate::update::Setting; use crate::vector::EmbeddingConfig; +use crate::UserError; #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub struct EmbeddingSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "source")] - #[deserr(default, rename = "source")] - pub embedder_options: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] - pub document_template: Setting, + pub source: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub model: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub revision: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub api_key: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub dimensions: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub document_template: Setting, +} + +pub fn check_unset( + key: &Setting, + field: &'static str, + source: EmbedderSource, + embedder_name: &str, +) -> Result<(), UserError> { + if matches!(key, Setting::NotSet) { + Ok(()) + } else { + Err(UserError::InvalidFieldForSource { + embedder_name: embedder_name.to_owned(), + source_: source, + field, + allowed_fields_for_source: EmbeddingSettings::allowed_fields_for_source(source), + allowed_sources_for_field: EmbeddingSettings::allowed_sources_for_field(field), + }) + } +} + +pub fn check_set( + key: &Setting, + field: &'static str, + source: EmbedderSource, + embedder_name: &str, +) -> Result<(), UserError> { + if matches!(key, Setting::Set(_)) { + Ok(()) + } else { + Err(UserError::MissingFieldForSource { + field, + source_: source, + embedder_name: embedder_name.to_owned(), + }) + } +} + +impl EmbeddingSettings { + pub const SOURCE: &str = "source"; + pub const MODEL: &str = "model"; + pub const REVISION: &str = "revision"; + pub const API_KEY: &str = "apiKey"; + pub const DIMENSIONS: &str = "dimensions"; + pub const DOCUMENT_TEMPLATE: &str = "documentTemplate"; + + pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { + match field { + Self::SOURCE => { + &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::UserProvided] + } + Self::MODEL => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], + Self::REVISION => &[EmbedderSource::HuggingFace], + Self::API_KEY => &[EmbedderSource::OpenAi], + Self::DIMENSIONS => &[EmbedderSource::UserProvided], + Self::DOCUMENT_TEMPLATE => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], + _other => unreachable!("unknown field"), + } + } + + pub fn allowed_fields_for_source(source: EmbedderSource) -> &'static [&'static str] { + match source { + EmbedderSource::OpenAi => { + &[Self::SOURCE, Self::MODEL, Self::API_KEY, Self::DOCUMENT_TEMPLATE] + } + EmbedderSource::HuggingFace => { + &[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE] + } + EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS], + } + } + + pub(crate) fn apply_default_source(setting: &mut Setting) { + if let Setting::Set(EmbeddingSettings { + source: source @ (Setting::NotSet | Setting::Reset), + .. + }) = setting + { + *source = Setting::Set(EmbedderSource::default()) + } + } +} + +#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub enum EmbedderSource { + #[default] + OpenAi, + HuggingFace, + UserProvided, +} + +impl std::fmt::Display for EmbedderSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + EmbedderSource::OpenAi => "openAi", + EmbedderSource::HuggingFace => "huggingFace", + EmbedderSource::UserProvided => "userProvided", + }; + f.write_str(s) + } } impl EmbeddingSettings { pub fn apply(&mut self, new: Self) { - let EmbeddingSettings { embedder_options, document_template: prompt } = new; - self.embedder_options.apply(embedder_options); - self.document_template.apply(prompt); + let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = + new; + let old_source = self.source; + self.source.apply(source); + // Reinitialize the whole setting object on a source change + if old_source != self.source { + *self = EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + }; + return; + } + + self.model.apply(model); + self.revision.apply(revision); + self.api_key.apply(api_key); + self.dimensions.apply(dimensions); + self.document_template.apply(document_template); } } impl From for EmbeddingSettings { fn from(value: EmbeddingConfig) -> Self { - Self { - embedder_options: Setting::Set(value.embedder_options.into()), - document_template: Setting::Set(value.prompt.into()), + let EmbeddingConfig { embedder_options, prompt } = value; + match embedder_options { + super::EmbedderOptions::HuggingFace(options) => Self { + source: Setting::Set(EmbedderSource::HuggingFace), + model: Setting::Set(options.model), + revision: options.revision.map(Setting::Set).unwrap_or_default(), + api_key: Setting::NotSet, + dimensions: Setting::NotSet, + document_template: Setting::Set(prompt.template), + }, + super::EmbedderOptions::OpenAi(options) => Self { + source: Setting::Set(EmbedderSource::OpenAi), + model: Setting::Set(options.embedding_model.name().to_owned()), + revision: Setting::NotSet, + api_key: options.api_key.map(Setting::Set).unwrap_or_default(), + dimensions: Setting::NotSet, + document_template: Setting::Set(prompt.template), + }, + super::EmbedderOptions::UserProvided(options) => Self { + source: Setting::Set(EmbedderSource::UserProvided), + model: Setting::NotSet, + revision: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::Set(options.dimensions), + document_template: Setting::NotSet, + }, } } } @@ -37,262 +194,51 @@ impl From for EmbeddingSettings { impl From for EmbeddingConfig { fn from(value: EmbeddingSettings) -> Self { let mut this = Self::default(); - let EmbeddingSettings { embedder_options, document_template: prompt } = value; - if let Some(embedder_options) = embedder_options.set() { - this.embedder_options = embedder_options.into(); - } - if let Some(prompt) = prompt.set() { - this.prompt = prompt.into(); - } - this - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct PromptSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub template: Setting, -} - -impl PromptSettings { - pub fn apply(&mut self, new: Self) { - let PromptSettings { template } = new; - self.template.apply(template); - } -} - -impl From for PromptSettings { - fn from(value: PromptData) -> Self { - Self { template: Setting::Set(value.template) } - } -} - -impl From for PromptData { - fn from(value: PromptSettings) -> Self { - let mut this = PromptData::default(); - let PromptSettings { template } = value; - if let Some(template) = template.set() { - this.template = template; - } - this - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -pub enum EmbedderSettings { - HuggingFace(Setting), - OpenAi(Setting), - UserProvided(UserProvidedSettings), -} - -impl Deserr for EmbedderSettings -where - E: deserr::DeserializeError, -{ - fn deserialize_from_value( - value: deserr::Value, - location: deserr::ValuePointerRef, - ) -> Result { - match value { - deserr::Value::Map(map) => { - if deserr::Map::len(&map) != 1 { - return Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::Unexpected { - msg: format!( - "Expected a single field, got {} fields", - deserr::Map::len(&map) - ), - }, - location, - ))); + let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = + value; + if let Some(source) = source.set() { + match source { + EmbedderSource::OpenAi => { + let mut options = super::openai::EmbedderOptions::with_default_model(None); + if let Some(model) = model.set() { + if let Some(model) = super::openai::EmbeddingModel::from_name(&model) { + options.embedding_model = model; + } + } + if let Some(api_key) = api_key.set() { + options.api_key = Some(api_key); + } + this.embedder_options = super::EmbedderOptions::OpenAi(options); } - let mut it = deserr::Map::into_iter(map); - let (k, v) = it.next().unwrap(); - - match k.as_str() { - "huggingFace" => Ok(EmbedderSettings::HuggingFace(Setting::Set( - HfEmbedderSettings::deserialize_from_value( - v.into_value(), - location.push_key(&k), - )?, - ))), - "openAi" => Ok(EmbedderSettings::OpenAi(Setting::Set( - OpenAiEmbedderSettings::deserialize_from_value( - v.into_value(), - location.push_key(&k), - )?, - ))), - "userProvided" => Ok(EmbedderSettings::UserProvided( - UserProvidedSettings::deserialize_from_value( - v.into_value(), - location.push_key(&k), - )?, - )), - other => Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::UnknownKey { - key: other, - accepted: &["huggingFace", "openAi", "userProvided"], - }, - location, - ))), + EmbedderSource::HuggingFace => { + let mut options = super::hf::EmbedderOptions::default(); + if let Some(model) = model.set() { + options.model = model; + // Reset the revision if we are setting the model. + // This allows the following: + // "huggingFace": {} -> default model with default revision + // "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision + // "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision + options.revision = None; + } + if let Some(revision) = revision.set() { + options.revision = Some(revision); + } + this.embedder_options = super::EmbedderOptions::HuggingFace(options); + } + EmbedderSource::UserProvided => { + this.embedder_options = + super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { + dimensions: dimensions.set().unwrap(), + }); } } - _ => Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::IncorrectValueKind { - actual: value, - accepted: &[deserr::ValueKind::Map], - }, - location, - ))), } - } -} -impl Default for EmbedderSettings { - fn default() -> Self { - Self::OpenAi(Default::default()) - } -} - -impl From for EmbedderSettings { - fn from(value: crate::vector::EmbedderOptions) -> Self { - match value { - crate::vector::EmbedderOptions::HuggingFace(hf) => { - Self::HuggingFace(Setting::Set(hf.into())) - } - crate::vector::EmbedderOptions::OpenAi(openai) => { - Self::OpenAi(Setting::Set(openai.into())) - } - crate::vector::EmbedderOptions::UserProvided(user_provided) => { - Self::UserProvided(user_provided.into()) - } + if let Setting::Set(template) = document_template { + this.prompt = PromptData { template } } - } -} -impl From for crate::vector::EmbedderOptions { - fn from(value: EmbedderSettings) -> Self { - match value { - EmbedderSettings::HuggingFace(Setting::Set(hf)) => Self::HuggingFace(hf.into()), - EmbedderSettings::HuggingFace(_setting) => Self::HuggingFace(Default::default()), - EmbedderSettings::OpenAi(Setting::Set(ai)) => Self::OpenAi(ai.into()), - EmbedderSettings::OpenAi(_setting) => { - Self::OpenAi(crate::vector::openai::EmbedderOptions::with_default_model(None)) - } - EmbedderSettings::UserProvided(user_provided) => { - Self::UserProvided(user_provided.into()) - } - } - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct HfEmbedderSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub model: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub revision: Setting, -} - -impl HfEmbedderSettings { - pub fn apply(&mut self, new: Self) { - let HfEmbedderSettings { model, revision } = new; - self.model.apply(model); - self.revision.apply(revision); - } -} - -impl From for HfEmbedderSettings { - fn from(value: crate::vector::hf::EmbedderOptions) -> Self { - Self { - model: Setting::Set(value.model), - revision: value.revision.map(Setting::Set).unwrap_or(Setting::NotSet), - } - } -} - -impl From for crate::vector::hf::EmbedderOptions { - fn from(value: HfEmbedderSettings) -> Self { - let HfEmbedderSettings { model, revision } = value; - let mut this = Self::default(); - if let Some(model) = model.set() { - this.model = model; - // Reset the revision if we are setting the model. - // This allows the following: - // "huggingFace": {} -> default model with default revision - // "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision - // "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision - this.revision = None; - } - if let Some(revision) = revision.set() { - this.revision = Some(revision); - } this } } - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct OpenAiEmbedderSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub api_key: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "model")] - #[deserr(default, rename = "model")] - pub embedding_model: Setting, -} - -impl OpenAiEmbedderSettings { - pub fn apply(&mut self, new: Self) { - let Self { api_key, embedding_model: embedding_mode } = new; - self.api_key.apply(api_key); - self.embedding_model.apply(embedding_mode); - } -} - -impl From for OpenAiEmbedderSettings { - fn from(value: crate::vector::openai::EmbedderOptions) -> Self { - Self { - api_key: value.api_key.map(Setting::Set).unwrap_or(Setting::Reset), - embedding_model: Setting::Set(value.embedding_model), - } - } -} - -impl From for crate::vector::openai::EmbedderOptions { - fn from(value: OpenAiEmbedderSettings) -> Self { - let OpenAiEmbedderSettings { api_key, embedding_model } = value; - Self { api_key: api_key.set(), embedding_model: embedding_model.set().unwrap_or_default() } - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct UserProvidedSettings { - pub dimensions: usize, -} - -impl From for crate::vector::manual::EmbedderOptions { - fn from(value: UserProvidedSettings) -> Self { - Self { dimensions: value.dimensions } - } -} - -impl From for UserProvidedSettings { - fn from(value: crate::vector::manual::EmbedderOptions) -> Self { - Self { dimensions: value.dimensions } - } -} From 14b396d302e04e0cd4758081097cadfdf2e2c1a0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:01:22 +0100 Subject: [PATCH 19/41] Add new errors --- meilisearch-types/src/error.rs | 5 ++++- milli/src/error.rs | 29 ++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 62591e991..2182b1836 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -344,7 +344,10 @@ impl ErrorCode for milli::Error { Code::InvalidDocumentId } UserError::MissingDocumentField(_) => Code::InvalidDocumentFields, - UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders, + UserError::InvalidFieldForSource { .. } + | UserError::MissingFieldForSource { .. } + | UserError::InvalidOpenAiModel { .. } + | UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound, diff --git a/milli/src/error.rs b/milli/src/error.rs index 9c5d8f416..539861e73 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -192,7 +192,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MissingDocumentField(#[from] crate::prompt::error::RenderPromptError), #[error(transparent)] InvalidPrompt(#[from] crate::prompt::error::NewPromptError), - #[error("Invalid prompt in for embeddings with name '{0}': {1}.")] + #[error("`.embedders.{0}.documentTemplate`: Invalid template: {1}.")] InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), @@ -200,6 +200,33 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidEmbedder(String), #[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")] TooManyVectors(String, usize), + #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}", + allowed_sources_for_field + .iter() + .map(|accepted| format!("`{}`", accepted)) + .collect::>() + .join(", "), + allowed_fields_for_source + .iter() + .map(|accepted| format!("`{}`", accepted)) + .collect::>() + .join(", ") + )] + InvalidFieldForSource { + embedder_name: String, + source_: crate::vector::settings::EmbedderSource, + field: &'static str, + allowed_fields_for_source: &'static [&'static str], + allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource], + }, + #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())] + InvalidOpenAiModel { embedder_name: String, model: String }, + #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")] + MissingFieldForSource { + field: &'static str, + source_: crate::vector::settings::EmbedderSource, + embedder_name: String, + }, } impl From for Error { From 9123370e904beb9a96f31ba499604de29aa41d75 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:12:34 +0100 Subject: [PATCH 20/41] Validate fused settings in settings task after fusing with existing setting --- milli/src/update/mod.rs | 2 +- milli/src/update/settings.rs | 89 +++++++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 13 deletions(-) diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index eb2b6e69a..66c52a52f 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -8,7 +8,7 @@ pub use self::index_documents::{ MergeFn, }; pub use self::indexer_config::IndexerConfig; -pub use self::settings::{Setting, Settings}; +pub use self::settings::{validate_embedding_settings, Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index e47a5ad52..d770bcd74 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -17,7 +17,7 @@ use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; -use crate::vector::settings::{EmbeddingSettings, PromptSettings}; +use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldsIdsMap, Index, OrderBy, Result}; @@ -958,17 +958,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) { match joined { + // updated config EitherOrBoth::Both((name, mut old), (_, new)) => { - old.apply(new); - let new = validate_prompt(&name, old)?; - changed = true; + changed |= old.apply(new); + let new = validate_embedding_settings(old, &name)?; new_configs.insert(name, new); } + // unchanged config EitherOrBoth::Left((name, setting)) => { new_configs.insert(name, setting); } - EitherOrBoth::Right((name, setting)) => { - let setting = validate_prompt(&name, setting)?; + // new config + EitherOrBoth::Right((name, mut setting)) => { + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; changed = true; new_configs.insert(name, setting); } @@ -1080,8 +1086,12 @@ fn validate_prompt( ) -> Result> { match new { Setting::Set(EmbeddingSettings { - embedder_options, - document_template: Setting::Set(PromptSettings { template: Setting::Set(template) }), + source, + model, + revision, + api_key, + dimensions, + document_template: Setting::Set(template), }) => { // validate let template = crate::prompt::Prompt::new(template) @@ -1089,16 +1099,71 @@ fn validate_prompt( .map_err(|inner| UserError::InvalidPromptForEmbeddings(name.to_owned(), inner))?; Ok(Setting::Set(EmbeddingSettings { - embedder_options, - document_template: Setting::Set(PromptSettings { - template: Setting::Set(template), - }), + source, + model, + revision, + api_key, + dimensions, + document_template: Setting::Set(template), })) } new => Ok(new), } } +pub fn validate_embedding_settings( + settings: Setting, + name: &str, +) -> Result> { + let settings = validate_prompt(name, settings)?; + let Setting::Set(settings) = settings else { return Ok(settings) }; + let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = + settings; + let Some(inferred_source) = source.set() else { + return Ok(Setting::Set(EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + })); + }; + match inferred_source { + EmbedderSource::OpenAi => { + check_unset(&revision, "revision", inferred_source, name)?; + check_unset(&dimensions, "dimensions", inferred_source, name)?; + if let Setting::Set(model) = &model { + crate::vector::openai::EmbeddingModel::from_name(model.as_str()).ok_or( + crate::error::UserError::InvalidOpenAiModel { + embedder_name: name.to_owned(), + model: model.clone(), + }, + )?; + } + } + EmbedderSource::HuggingFace => { + check_unset(&api_key, "apiKey", inferred_source, name)?; + check_unset(&dimensions, "dimensions", inferred_source, name)?; + } + EmbedderSource::UserProvided => { + check_unset(&model, "model", inferred_source, name)?; + check_unset(&revision, "revision", inferred_source, name)?; + check_unset(&api_key, "apiKey", inferred_source, name)?; + check_unset(&document_template, "documentTemplate", inferred_source, name)?; + check_set(&dimensions, "dimensions", inferred_source, name)?; + } + } + Ok(Setting::Set(EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + })) +} + #[cfg(test)] mod tests { use big_s::S; From ec9649c922dcee54f2320e9a5e2f0b029063e773 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:13:27 +0100 Subject: [PATCH 21/41] Add function to validate settings in Meilisearch, to be used in the routes --- meilisearch-types/src/settings.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index b0dee69a3..244fbffa2 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -318,6 +318,21 @@ impl Settings { _kind: PhantomData, } } + + pub fn validate(self) -> Result { + self.validate_embedding_settings() + } + + fn validate_embedding_settings(mut self) -> Result { + let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; + for (name, config) in configs.iter_mut() { + let config_to_check = std::mem::take(config); + let checked_config = milli::update::validate_embedding_settings(config_to_check, name)?; + *config = checked_config + } + self.embedders = Setting::Set(configs); + Ok(self) + } } #[derive(Debug, Clone, Serialize, Deserialize)] From 2e4c9651dfb973be86a053f1971a30170c752736 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:13:55 +0100 Subject: [PATCH 22/41] Validate settings in route --- meilisearch/src/routes/indexes/settings.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 290cab2e0..feb174a1b 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -90,6 +90,8 @@ macro_rules! make_setting_route { ..Default::default() }; + let new_settings = new_settings.validate()?; + let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -582,13 +584,13 @@ fn embedder_analytics( for source in s .values() .filter_map(|config| config.clone().set()) - .filter_map(|config| config.embedder_options.set()) + .filter_map(|config| config.source.set()) { - use meilisearch_types::milli::vector::settings::EmbedderSettings; + use meilisearch_types::milli::vector::settings::EmbedderSource; match source { - EmbedderSettings::OpenAi(_) => sources.insert("openAi"), - EmbedderSettings::HuggingFace(_) => sources.insert("huggingFace"), - EmbedderSettings::UserProvided(_) => sources.insert("userProvided"), + EmbedderSource::OpenAi => sources.insert("openAi"), + EmbedderSource::HuggingFace => sources.insert("huggingFace"), + EmbedderSource::UserProvided => sources.insert("userProvided"), }; } }; @@ -651,6 +653,7 @@ pub async fn update_all( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let new_settings = body.into_inner(); + let new_settings = new_settings.validate()?; analytics.publish( "Settings Updated".to_string(), From 6ff81de401608cbcfba3844e7d0a01a9c80ab82a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:06:50 +0100 Subject: [PATCH 23/41] Fix tests --- meilisearch/tests/search/hybrid.rs | 6 +++--- meilisearch/tests/search/mod.rs | 14 +++++++++++--- milli/src/update/index_documents/mod.rs | 18 ++++++++++-------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index fb6fe297f..77a29d4a3 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -21,9 +21,9 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde "###); let (response, code) = index - .update_settings( - json!({ "embedders": {"default": {"source": {"userProvided": {"dimensions": 2}}}} }), - ) + .update_settings(json!({ "embedders": {"default": { + "source": "userProvided", + "dimensions": 2}}} )) .await; assert_eq!(202, code, "{:?}", response); index.wait_task(response.uid()).await; diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 133a143fd..9b7b01029 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -890,13 +890,21 @@ async fn experimental_feature_vector_store() { let (response, code) = index .update_settings(json!({"embedders": { "manual": { - "source": { - "userProvided": {"dimensions": 3} - } + "source": "userProvided", + "dimensions": 3, } }})) .await; + meili_snap::snapshot!(response, @r###" + { + "taskUid": 1, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); meili_snap::snapshot!(code, @"202 Accepted"); let response = index.wait_task(response.uid()).await; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ffc3f6b3a..738cfeb38 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2553,7 +2553,7 @@ mod tests { /// Vectors must be of the same length. #[test] fn test_multiple_vectors() { - use crate::vector::settings::{EmbedderSettings, EmbeddingSettings}; + use crate::vector::settings::EmbeddingSettings; let index = TempIndex::new(); index @@ -2562,9 +2562,11 @@ mod tests { embedders.insert( "manual".to_string(), Setting::Set(EmbeddingSettings { - embedder_options: Setting::Set(EmbedderSettings::UserProvided( - crate::vector::settings::UserProvidedSettings { dimensions: 3 }, - )), + source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided), + model: Setting::NotSet, + revision: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::Set(3), document_template: Setting::NotSet, }), ); @@ -2579,10 +2581,10 @@ mod tests { .unwrap(); index.add_documents(documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}])).unwrap(); index - .add_documents( - documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]), - ) - .unwrap(); + .add_documents( + documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]), + ) + .unwrap(); let rtxn = index.read_txn().unwrap(); let res = index.search(&rtxn).vector([0.0, 1.0, 2.0].to_vec()).execute().unwrap(); From 0bf879fb889711a1d57cbd5788a6bc790ac987f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 20 Dec 2023 17:48:09 +0100 Subject: [PATCH 24/41] Fix warning on rust stable --- milli/src/vector/settings.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 945fc62c0..37fb80452 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -67,12 +67,12 @@ pub fn check_set( } impl EmbeddingSettings { - pub const SOURCE: &str = "source"; - pub const MODEL: &str = "model"; - pub const REVISION: &str = "revision"; - pub const API_KEY: &str = "apiKey"; - pub const DIMENSIONS: &str = "dimensions"; - pub const DOCUMENT_TEMPLATE: &str = "documentTemplate"; + pub const SOURCE: &'static str = "source"; + pub const MODEL: &'static str = "model"; + pub const REVISION: &'static str = "revision"; + pub const API_KEY: &'static str = "apiKey"; + pub const DIMENSIONS: &'static str = "dimensions"; + pub const DOCUMENT_TEMPLATE: &'static str = "documentTemplate"; pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { match field { From f52dee2b3b2b63f9dc1755e68e32e2053a268a91 Mon Sep 17 00:00:00 2001 From: Morgane Dubus <30866152+mdubus@users.noreply.github.com> Date: Thu, 21 Dec 2023 09:53:13 +0100 Subject: [PATCH 25/41] Update Cargo.toml Update mini-dashboard with v0.2.12 --- meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index c59b38fa6..ceadbb1f1 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -153,5 +153,5 @@ greek = ["meilisearch-types/greek"] khmer = ["meilisearch-types/khmer"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip" -sha1 = "83cd44ed1e5f97ecb581dc9f958a63f4ccc982d9" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.12/build.zip" +sha1 = "acfe9a018c93eb0604ea87ee87bff7df5474e18e" From ee54d3171e4d4c5b69fb093ea2aa5858a086efcb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 21 Dec 2023 15:26:12 +0100 Subject: [PATCH 26/41] Check experimental feature at query time --- index-scheduler/src/batch.rs | 3 --- meilisearch/src/routes/indexes/settings.rs | 17 +++++++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index c1c7af7ac..f6ed8392c 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1351,9 +1351,6 @@ impl IndexScheduler { for (task, (_, settings)) in tasks.iter_mut().zip(settings) { let checked_settings = settings.clone().check(); - if matches!(checked_settings.embedders, milli::update::Setting::Set(_)) { - self.features().check_vector("Passing `embedders` in settings")? - } task.details = Some(Details::SettingsUpdate { settings: Box::new(settings) }); apply_settings_to_builder(&checked_settings, &mut builder); diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index feb174a1b..3ab6afe07 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -90,7 +90,10 @@ macro_rules! make_setting_route { ..Default::default() }; - let new_settings = new_settings.validate()?; + let new_settings = $crate::routes::indexes::settings::validate_settings( + new_settings, + &index_scheduler, + )?; let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -653,7 +656,7 @@ pub async fn update_all( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let new_settings = body.into_inner(); - let new_settings = new_settings.validate()?; + let new_settings = validate_settings(new_settings, &index_scheduler)?; analytics.publish( "Settings Updated".to_string(), @@ -803,3 +806,13 @@ pub async fn delete_all( debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } + +fn validate_settings( + settings: Settings, + index_scheduler: &IndexScheduler, +) -> Result, ResponseError> { + if matches!(settings.embedders, Setting::Set(_)) { + index_scheduler.features().check_vector("Passing `embedders` in settings")? + } + Ok(settings.validate()?) +} From 54ae6951eba871c0253c76992cc5030d7272903a Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jan 2024 15:19:00 +0100 Subject: [PATCH 27/41] fix warning --- milli/src/update/index_documents/helpers/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 52638d6f6..1e29c0240 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -16,7 +16,7 @@ pub use merge_functions::{ keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, - obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, + obkvs_merge_additions_and_deletions, MergeFn, }; use crate::MAX_WORD_LENGTH; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 738cfeb38..e797c0380 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -21,7 +21,7 @@ use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; -pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; +pub use self::enrich::{extract_finite_float_from_value, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, From 94b9f3b310fec35ac3d215d2725e460bd5ff64af Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Jan 2024 12:45:13 +0100 Subject: [PATCH 28/41] Add test --- meilisearch/tests/search/hybrid.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 77a29d4a3..79819cab2 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -56,6 +56,15 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { }]) }); +static SINGLE_DOCUMENT: Lazy = Lazy::new(|| { + json!([{ + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": {"default": [1.0, 3.0]}, + }]) +}); + #[actix_rt::test] async fn simple_search() { let server = Server::new().await; @@ -149,3 +158,18 @@ async fn invalid_semantic_ratio() { } "###); } + +#[actix_rt::test] +async fn single_document() { + let server = Server::new().await; + let index = index_with_documents(&server, &SINGLE_DOCUMENT).await; + + let (response, code) = index + .search_post( + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + ) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); +} From 12edc2c20a3ae0a0ead0ac301b3c51c47a803e0d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Jan 2024 15:59:37 +0100 Subject: [PATCH 29/41] Update arroy to a fixed version --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 496710399..72701288f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -383,7 +383,7 @@ dependencies = [ [[package]] name = "arroy" version = "0.1.0" -source = "git+https://github.com/meilisearch/arroy.git#4f193fd534acd357b65bfe9eec4b3fed8ece2007" +source = "git+https://github.com/meilisearch/arroy.git#d372648212e561a4845077cdb9239423d78655a2" dependencies = [ "bytemuck", "byteorder", From f75f22e0263c4e76913cae37b5e56d0fa424a267 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 8 Jan 2024 11:09:37 +0100 Subject: [PATCH 30/41] Display default value when proximityPrecision is not set --- meilisearch-types/src/settings.rs | 20 ++++++++++++++++-- meilisearch/tests/dumps/mod.rs | 24 +++++++++++----------- meilisearch/tests/settings/get_settings.rs | 1 + 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 244fbffa2..7606fa44a 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -37,6 +37,17 @@ where .serialize(s) } +fn serialize_with_default(field: &Setting, s: S) -> std::result::Result +where + T: Default + Serialize, + S: Serializer, +{ + match field { + Setting::Set(value) => value.serialize(s), + Setting::Reset | Setting::NotSet => T::default().serialize(s), + } +} + #[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)] pub struct Checked; @@ -186,7 +197,11 @@ pub struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] pub distinct_attribute: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[serde( + default, + serialize_with = "serialize_with_default", + skip_serializing_if = "Setting::is_not_set" + )] #[deserr(default, error = DeserrJsonError)] pub proximity_precision: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] @@ -735,10 +750,11 @@ impl From for Criterion { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub enum ProximityPrecisionView { + #[default] ByWord, ByAttribute, } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 5ad0ed247..90b9b91da 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -59,7 +59,7 @@ async fn import_dump_v1_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -221,7 +221,7 @@ async fn import_dump_v1_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -369,7 +369,7 @@ async fn import_dump_v1_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -503,7 +503,7 @@ async fn import_dump_v2_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -649,7 +649,7 @@ async fn import_dump_v2_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -794,7 +794,7 @@ async fn import_dump_v2_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -928,7 +928,7 @@ async fn import_dump_v3_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1074,7 +1074,7 @@ async fn import_dump_v3_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1219,7 +1219,7 @@ async fn import_dump_v3_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1353,7 +1353,7 @@ async fn import_dump_v4_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1499,7 +1499,7 @@ async fn import_dump_v4_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1644,7 +1644,7 @@ async fn import_dump_v4_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 9ab53c51e..79689c1b5 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -84,6 +84,7 @@ async fn get_settings() { }) ); assert_eq!(settings["embedders"], json!({})); + assert_eq!(settings["proximityPrecision"], json!("byWord")); } #[actix_rt::test] From e27b850b0989609e7e97af8023e915168db57a55 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 8 Jan 2024 14:03:47 +0100 Subject: [PATCH 31/41] move the default display strategy on setting getter function --- meilisearch-types/src/settings.rs | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 7606fa44a..9613cb9f3 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -37,17 +37,6 @@ where .serialize(s) } -fn serialize_with_default(field: &Setting, s: S) -> std::result::Result -where - T: Default + Serialize, - S: Serializer, -{ - match field { - Setting::Set(value) => value.serialize(s), - Setting::Reset | Setting::NotSet => T::default().serialize(s), - } -} - #[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)] pub struct Checked; @@ -197,11 +186,7 @@ pub struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] pub distinct_attribute: Setting, - #[serde( - default, - serialize_with = "serialize_with_default", - skip_serializing_if = "Setting::is_not_set" - )] + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] pub proximity_precision: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] @@ -641,10 +626,7 @@ pub fn settings( Some(field) => Setting::Set(field), None => Setting::Reset, }, - proximity_precision: match proximity_precision { - Some(precision) => Setting::Set(precision), - None => Setting::Reset, - }, + proximity_precision: Setting::Set(proximity_precision.unwrap_or_default()), synonyms: Setting::Set(synonyms), typo_tolerance: Setting::Set(typo_tolerance), faceting: Setting::Set(faceting), From 97bb1ff9e261e0802a0c19d4977d8b0f2fe9d999 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 9 Jan 2024 15:37:27 +0100 Subject: [PATCH 32/41] Move `currently_updating_index` to IndexMapper --- index-scheduler/src/batch.rs | 4 ++-- index-scheduler/src/index_mapper/mod.rs | 17 +++++++++++++++++ index-scheduler/src/insta_snapshot.rs | 1 - index-scheduler/src/lib.rs | 15 +-------------- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index f6ed8392c..97dc7a2bd 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -936,8 +936,8 @@ impl IndexScheduler { }; // the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick - *self.currently_updating_index.write().unwrap() = - Some((index_uid.clone(), index.clone())); + self.index_mapper + .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let mut index_wtxn = index.write_txn()?; let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?; diff --git a/index-scheduler/src/index_mapper/mod.rs b/index-scheduler/src/index_mapper/mod.rs index 18aed42b0..58ec2bf11 100644 --- a/index-scheduler/src/index_mapper/mod.rs +++ b/index-scheduler/src/index_mapper/mod.rs @@ -69,6 +69,10 @@ pub struct IndexMapper { /// Whether we open a meilisearch index with the MDB_WRITEMAP option or not. enable_mdb_writemap: bool, pub indexer_config: Arc, + + /// A few types of long running batches of tasks that act on a single index set this field + /// so that a handle to the index is available from other threads (search) in an optimized manner. + currently_updating_index: Arc>>, } /// Whether the index is available for use or is forbidden to be inserted back in the index map @@ -151,6 +155,7 @@ impl IndexMapper { index_growth_amount, enable_mdb_writemap, indexer_config: Arc::new(indexer_config), + currently_updating_index: Default::default(), }) } @@ -303,6 +308,14 @@ impl IndexMapper { /// Return an index, may open it if it wasn't already opened. pub fn index(&self, rtxn: &RoTxn, name: &str) -> Result { + if let Some((current_name, current_index)) = + self.currently_updating_index.read().unwrap().as_ref() + { + if current_name == name { + return Ok(current_index.clone()); + } + } + let uuid = self .index_mapping .get(rtxn, name)? @@ -474,4 +487,8 @@ impl IndexMapper { pub fn indexer_config(&self) -> &IndexerConfig { &self.indexer_config } + + pub fn set_currently_updating_index(&self, index: Option<(String, Index)>) { + *self.currently_updating_index.write().unwrap() = index; + } } diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 0adda43ff..42f041578 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -42,7 +42,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { test_breakpoint_sdr: _, planned_failures: _, run_loop_iteration: _, - currently_updating_index: _, embedders: _, } = scheduler; diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 296f8add1..a5b0cb5b0 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -351,10 +351,6 @@ pub struct IndexScheduler { /// The path to the version file of Meilisearch. pub(crate) version_file_path: PathBuf, - /// A few types of long running batches of tasks that act on a single index set this field - /// so that a handle to the index is available from other threads (search) in an optimized manner. - currently_updating_index: Arc>>, - embedders: Arc>>>, // ================= test @@ -403,7 +399,6 @@ impl IndexScheduler { version_file_path: self.version_file_path.clone(), webhook_url: self.webhook_url.clone(), webhook_authorization_header: self.webhook_authorization_header.clone(), - currently_updating_index: self.currently_updating_index.clone(), embedders: self.embedders.clone(), #[cfg(test)] test_breakpoint_sdr: self.test_breakpoint_sdr.clone(), @@ -504,7 +499,6 @@ impl IndexScheduler { version_file_path: options.version_file_path, webhook_url: options.webhook_url, webhook_authorization_header: options.webhook_authorization_header, - currently_updating_index: Arc::new(RwLock::new(None)), embedders: Default::default(), #[cfg(test)] @@ -688,13 +682,6 @@ impl IndexScheduler { /// If you need to fetch information from or perform an action on all indexes, /// see the `try_for_each_index` function. pub fn index(&self, name: &str) -> Result { - if let Some((current_name, current_index)) = - self.currently_updating_index.read().unwrap().as_ref() - { - if current_name == name { - return Ok(current_index.clone()); - } - } let rtxn = self.env.read_txn()?; self.index_mapper.index(&rtxn, name) } @@ -1175,7 +1162,7 @@ impl IndexScheduler { }; // Reset the currently updating index to relinquish the index handle - *self.currently_updating_index.write().unwrap() = None; + self.index_mapper.set_currently_updating_index(None); #[cfg(test)] self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?; From 3f3462ab62abbf5d8af7621755ab2b7e87f1ea8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Jan 2024 16:34:40 +0100 Subject: [PATCH 33/41] Limit the number of values returned by the facet search --- meilisearch/src/search.rs | 3 +++ meilisearch/tests/search/facet_search.rs | 18 ++++++++++++++++++ milli/src/search/mod.rs | 22 +++++++++++++++++----- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index b5dba8a58..399c7b95d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -735,6 +735,9 @@ pub fn perform_facet_search( if let Some(facet_query) = &facet_query { facet_search.query(facet_query); } + if let Some(max_facets) = index.max_values_per_facet(&rtxn)? { + facet_search.max_values(max_facets as usize); + } Ok(FacetSearchResult { facet_hits: facet_search.execute()?, diff --git a/meilisearch/tests/search/facet_search.rs b/meilisearch/tests/search/facet_search.rs index 8c1229f1a..5f9f631f9 100644 --- a/meilisearch/tests/search/facet_search.rs +++ b/meilisearch/tests/search/facet_search.rs @@ -105,6 +105,24 @@ async fn more_advanced_facet_search() { snapshot!(response["facetHits"].as_array().unwrap().len(), @"1"); } +#[actix_rt::test] +async fn simple_facet_search_with_max_values() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await; + index.update_settings_filterable_attributes(json!(["genres"])).await; + index.add_documents(documents, None).await; + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn non_filterable_facet_search_error() { let server = Server::new().await; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3e4849578..7bac5ea0c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -27,8 +27,8 @@ static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); -/// The maximum number of facets returned by the facet search route. -const MAX_NUMBER_OF_FACETS: usize = 100; +/// The maximum number of values per facet returned by the facet search route. +const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100; pub mod facet; mod fst_utils; @@ -306,6 +306,7 @@ pub struct SearchForFacetValues<'a> { query: Option, facet: String, search_query: Search<'a>, + max_values: usize, is_hybrid: bool, } @@ -315,7 +316,13 @@ impl<'a> SearchForFacetValues<'a> { search_query: Search<'a>, is_hybrid: bool, ) -> SearchForFacetValues<'a> { - SearchForFacetValues { query: None, facet, search_query, is_hybrid } + SearchForFacetValues { + query: None, + facet, + search_query, + max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, + is_hybrid, + } } pub fn query(&mut self, query: impl Into) -> &mut Self { @@ -323,6 +330,11 @@ impl<'a> SearchForFacetValues<'a> { self } + pub fn max_values(&mut self, max: usize) -> &mut Self { + self.max_values = max; + self + } + fn one_original_value_of( &self, field_id: FieldId, @@ -462,7 +474,7 @@ impl<'a> SearchForFacetValues<'a> { .unwrap_or_else(|| left_bound.to_string()); results.push(FacetValueHit { value, count }); } - if results.len() >= MAX_NUMBER_OF_FACETS { + if results.len() >= self.max_values { break; } } @@ -507,7 +519,7 @@ impl<'a> SearchForFacetValues<'a> { .unwrap_or_else(|| query.to_string()); results.push(FacetValueHit { value, count }); } - if results.len() >= MAX_NUMBER_OF_FACETS { + if results.len() >= self.max_values { return Ok(ControlFlow::Break(())); } } From 5f4fc6c95569b5b3c7aa81ffee3eea02d0100095 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 11 Jan 2024 09:44:16 +0100 Subject: [PATCH 34/41] Add timer logs --- milli/src/search/new/bucket_sort.rs | 1 + milli/src/search/new/mod.rs | 2 ++ milli/src/search/new/query_term/parse_query.rs | 1 + 3 files changed, 4 insertions(+) diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index b46f6124f..b439b87ec 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -15,6 +15,7 @@ pub struct BucketSortOutput { // TODO: would probably be good to regroup some of these inside of a struct? #[allow(clippy::too_many_arguments)] +#[logging_timer::time] pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 405b9747d..7b3b1d5b2 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -191,6 +191,7 @@ fn resolve_maximally_reduced_query_graph( Ok(docids) } +#[logging_timer::time] fn resolve_universe( ctx: &mut SearchContext, initial_universe: &RoaringBitmap, @@ -556,6 +557,7 @@ pub fn execute_vector_search( } #[allow(clippy::too_many_arguments)] +#[logging_timer::time] pub fn execute_search( ctx: &mut SearchContext, query: Option<&str>, diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 64fe07a31..865075d97 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -5,6 +5,7 @@ use super::*; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. +#[logging_timer::time] pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, From 5f5a4868953e94f885654a6d2b8bfbd8afae3da0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 11 Jan 2024 11:36:41 +0100 Subject: [PATCH 35/41] Reduce formatting time --- meilisearch/src/search.rs | 23 +++++++++++++++++++---- milli/src/search/new/matches/mod.rs | 6 +++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index b5dba8a58..7e783aa42 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -897,6 +897,14 @@ fn format_fields<'a>( let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); + // reduce the formated option list to the attributes that should be formatted, + // instead of all the attribute to display. + let formating_fields_options: Vec<_> = formatted_options + .iter() + .filter(|(_, option)| option.should_format()) + .map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option)) + .collect(); + // select the attributes to retrieve let displayable_names = displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name")); @@ -905,13 +913,15 @@ fn format_fields<'a>( // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only // highlighted. - let format = formatted_options + // Warn: The time to compute the `format` list scales with the number of field to format; + // cummulated with `map_leaf_values` that iterates over all the nested fields, it gives a quadratic complexity: + // `d*f` where `d` is the total number of field to display and `f` the total number of field to format. + let format = formating_fields_options .iter() - .filter(|(field, _option)| { - let name = field_ids_map.name(**field).unwrap(); + .filter(|(name, _option)| { milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) }) - .map(|(_, option)| *option) + .map(|(_, option)| **option) .reduce(|acc, option| acc.merge(option)); let mut infos = Vec::new(); @@ -941,6 +951,11 @@ fn format_value<'a>( infos: &mut Vec, compute_matches: bool, ) -> Value { + // early skip recursive function if nothing needs to be changed. + if !format_options.as_ref().map_or(false, FormatOptions::should_format) && !compute_matches { + return value; + } + match value { Value::String(old_string) => { let mut matcher = builder.build(&old_string); diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 067fa1efd..8de1d9262 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -72,7 +72,7 @@ impl<'m> MatcherBuilder<'m> { } } -#[derive(Copy, Clone, Default)] +#[derive(Copy, Clone, Default, Debug)] pub struct FormatOptions { pub highlight: bool, pub crop: Option, @@ -82,6 +82,10 @@ impl FormatOptions { pub fn merge(self, other: Self) -> Self { Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } } + + pub fn should_format(&self) -> bool { + self.highlight || self.crop.is_some() + } } #[derive(Clone, Debug)] From 81b6128b29b612049c5579ca6fb9082d5f46a216 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 11 Jan 2024 12:28:32 +0100 Subject: [PATCH 36/41] Update tests --- meilisearch/tests/settings/tokenizer_customization.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/meilisearch/tests/settings/tokenizer_customization.rs b/meilisearch/tests/settings/tokenizer_customization.rs index 4602e31f7..7f205eb30 100644 --- a/meilisearch/tests/settings/tokenizer_customization.rs +++ b/meilisearch/tests/settings/tokenizer_customization.rs @@ -94,7 +94,7 @@ async fn set_and_search() { "id": 1, "content": "Mac & cheese", "_formatted": { - "id": "1", + "id": 1, "content": "Mac & cheese" } }, @@ -102,7 +102,7 @@ async fn set_and_search() { "id": 3, "content": "Mac&sep&&sepcheese", "_formatted": { - "id": "3", + "id": 3, "content": "Mac&sep&&sepcheese" } } @@ -254,7 +254,7 @@ async fn advanced_synergies() { "id": 1, "content": "J.R.R. Tolkien", "_formatted": { - "id": "1", + "id": 1, "content": "J.R.R. Tolkien" } }, @@ -262,7 +262,7 @@ async fn advanced_synergies() { "id": 2, "content": "J. R. R. Tolkien", "_formatted": { - "id": "2", + "id": 2, "content": "J. R. R. Tolkien" } }, @@ -270,7 +270,7 @@ async fn advanced_synergies() { "id": 3, "content": "jrr Tolkien", "_formatted": { - "id": "3", + "id": 3, "content": "jrr Tolkien" } } From 86270e687837fcafc2730b017c29a01d3d2e6613 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 11 Jan 2024 12:43:56 +0100 Subject: [PATCH 37/41] Transform fields contained into _format into strings --- meilisearch/src/search.rs | 7 +------ meilisearch/tests/settings/tokenizer_customization.rs | 10 +++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 7e783aa42..6893081d4 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -951,11 +951,6 @@ fn format_value<'a>( infos: &mut Vec, compute_matches: bool, ) -> Value { - // early skip recursive function if nothing needs to be changed. - if !format_options.as_ref().map_or(false, FormatOptions::should_format) && !compute_matches { - return value; - } - match value { Value::String(old_string) => { let mut matcher = builder.build(&old_string); @@ -1023,7 +1018,7 @@ fn format_value<'a>( let value = matcher.format(format_options); Value::String(value.into_owned()) } - None => Value::Number(number), + None => Value::String(s), } } value => value, diff --git a/meilisearch/tests/settings/tokenizer_customization.rs b/meilisearch/tests/settings/tokenizer_customization.rs index 7f205eb30..4602e31f7 100644 --- a/meilisearch/tests/settings/tokenizer_customization.rs +++ b/meilisearch/tests/settings/tokenizer_customization.rs @@ -94,7 +94,7 @@ async fn set_and_search() { "id": 1, "content": "Mac & cheese", "_formatted": { - "id": 1, + "id": "1", "content": "Mac & cheese" } }, @@ -102,7 +102,7 @@ async fn set_and_search() { "id": 3, "content": "Mac&sep&&sepcheese", "_formatted": { - "id": 3, + "id": "3", "content": "Mac&sep&&sepcheese" } } @@ -254,7 +254,7 @@ async fn advanced_synergies() { "id": 1, "content": "J.R.R. Tolkien", "_formatted": { - "id": 1, + "id": "1", "content": "J.R.R. Tolkien" } }, @@ -262,7 +262,7 @@ async fn advanced_synergies() { "id": 2, "content": "J. R. R. Tolkien", "_formatted": { - "id": 2, + "id": "2", "content": "J. R. R. Tolkien" } }, @@ -270,7 +270,7 @@ async fn advanced_synergies() { "id": 3, "content": "jrr Tolkien", "_formatted": { - "id": 3, + "id": "3", "content": "jrr Tolkien" } } From b79b03d4e2b8cb3261bd2b5add1c75870c92756c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 11 Jan 2024 13:14:51 +0100 Subject: [PATCH 38/41] Fix proximity precision telemetry --- meilisearch/src/routes/indexes/settings.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 3ab6afe07..decc4ffc9 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -458,7 +458,7 @@ make_setting_route!( json!({ "proximity_precision": { "set": precision.is_some(), - "value": precision, + "value": precision.unwrap_or_default(), } }), Some(req), @@ -690,7 +690,8 @@ pub async fn update_all( "set": new_settings.distinct_attribute.as_ref().set().is_some() }, "proximity_precision": { - "set": new_settings.proximity_precision.as_ref().set().is_some() + "set": new_settings.proximity_precision.as_ref().set().is_some(), + "value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default() }, "typo_tolerance": { "enabled": new_settings.typo_tolerance From 95f8e2153394f3089602b1ffa16b3548c520cc98 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 11 Jan 2024 15:07:08 +0100 Subject: [PATCH 39/41] fix typos --- meilisearch/src/search.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 6893081d4..28ca5a6e1 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -897,9 +897,9 @@ fn format_fields<'a>( let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); - // reduce the formated option list to the attributes that should be formatted, - // instead of all the attribute to display. - let formating_fields_options: Vec<_> = formatted_options + // reduce the formatted option list to the attributes that should be formatted, + // instead of all the attributes to display. + let formatting_fields_options: Vec<_> = formatted_options .iter() .filter(|(_, option)| option.should_format()) .map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option)) @@ -913,10 +913,10 @@ fn format_fields<'a>( // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only // highlighted. - // Warn: The time to compute the `format` list scales with the number of field to format; - // cummulated with `map_leaf_values` that iterates over all the nested fields, it gives a quadratic complexity: - // `d*f` where `d` is the total number of field to display and `f` the total number of field to format. - let format = formating_fields_options + // Warn: The time to compute the format list scales with the number of fields to format; + // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity: + // d*f where d is the total number of fields to display and f is the total number of fields to format. + let format = formatting_fields_options .iter() .filter(|(name, _option)| { milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) From 84a5c304fc78d85652897182cc66a85a454c0579 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 11 Jan 2024 21:35:06 +0100 Subject: [PATCH 40/41] Don't display the embedders setting when it is an empty dict --- meilisearch-types/src/settings.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 9613cb9f3..ca46abb0c 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -600,11 +600,12 @@ pub fn settings( ), }; - let embedders = index + let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() .map(|(name, config)| (name, Setting::Set(config.into()))) .collect(); + let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; Ok(Settings { displayed_attributes: match displayed_attributes { @@ -631,7 +632,7 @@ pub fn settings( typo_tolerance: Setting::Set(typo_tolerance), faceting: Setting::Set(faceting), pagination: Setting::Set(pagination), - embedders: Setting::Set(embedders), + embedders, _kind: PhantomData, }) } From 38abfec611142ae97e54932a9238b6941032ac0b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 11 Jan 2024 21:35:30 +0100 Subject: [PATCH 41/41] Fix tests --- meilisearch/tests/dumps/mod.rs | 39 ++++++++-------------- meilisearch/tests/settings/get_settings.rs | 3 +- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 90b9b91da..fd34268a5 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -77,8 +77,7 @@ async fn import_dump_v1_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -239,8 +238,7 @@ async fn import_dump_v1_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -387,8 +385,7 @@ async fn import_dump_v1_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -521,8 +518,7 @@ async fn import_dump_v2_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -667,8 +663,7 @@ async fn import_dump_v2_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -812,8 +807,7 @@ async fn import_dump_v2_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -946,8 +940,7 @@ async fn import_dump_v3_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1092,8 +1085,7 @@ async fn import_dump_v3_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1237,8 +1229,7 @@ async fn import_dump_v3_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1371,8 +1362,7 @@ async fn import_dump_v4_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1517,8 +1507,7 @@ async fn import_dump_v4_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1662,8 +1651,7 @@ async fn import_dump_v4_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1907,8 +1895,7 @@ async fn import_dump_v6_containing_experimental_features() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "###); diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 79689c1b5..5642e854f 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -54,7 +54,7 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 16); + assert_eq!(settings.keys().len(), 15); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); @@ -83,7 +83,6 @@ async fn get_settings() { "maxTotalHits": 1000, }) ); - assert_eq!(settings["embedders"], json!({})); assert_eq!(settings["proximityPrecision"], json!("byWord")); }