diff --git a/Cargo.lock b/Cargo.lock index d57d381ed..72701288f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -383,7 +383,7 @@ dependencies = [ [[package]] name = "arroy" version = "0.1.0" -source = "git+https://github.com/meilisearch/arroy.git#4f193fd534acd357b65bfe9eec4b3fed8ece2007" +source = "git+https://github.com/meilisearch/arroy.git#d372648212e561a4845077cdb9239423d78655a2" dependencies = [ "bytemuck", "byteorder", @@ -1592,9 +1592,6 @@ name = "esaxx-rs" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" -dependencies = [ - "cc", -] [[package]] name = "fancy-regex" @@ -1680,9 +1677,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.26" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", @@ -1704,9 +1701,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -2756,9 +2753,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -2777,6 +2774,7 @@ dependencies = [ "dump", "enum-iterator", "file-store", + "flate2", "insta", "log", "meili-snap", @@ -2792,6 +2790,7 @@ dependencies = [ "tempfile", "thiserror", "time", + "ureq", "uuid 1.5.0", ] @@ -3562,6 +3561,7 @@ dependencies = [ "tokio", "tokio-stream", "toml", + "url", "urlencoding", "uuid 1.5.0", "vergen", @@ -4070,9 +4070,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" @@ -5310,11 +5310,9 @@ version = "0.14.1" source = "git+https://github.com/huggingface/tokenizers.git?tag=v0.14.1#6357206cdcce4d78ffb1e0372feb456caea09375" dependencies = [ "aho-corasick", - "clap", "derive_builder", "esaxx-rs", "getrandom", - "indicatif", "itertools 0.11.0", "lazy_static", "log", @@ -5602,13 +5600,14 @@ dependencies = [ [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index c4a37b7d6..a8c19f435 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -18,6 +18,7 @@ derive_builder = "0.12.0" dump = { path = "../dump" } enum-iterator = "1.4.0" file-store = { path = "../file-store" } +flate2 = "1.0.28" log = "0.4.17" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } @@ -30,6 +31,7 @@ synchronoise = "1.0.1" tempfile = "3.5.0" thiserror = "1.0.40" time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] } +ureq = "2.9.1" uuid = { version = "1.3.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index c1c7af7ac..97dc7a2bd 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -936,8 +936,8 @@ impl IndexScheduler { }; // the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick - *self.currently_updating_index.write().unwrap() = - Some((index_uid.clone(), index.clone())); + self.index_mapper + .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let mut index_wtxn = index.write_txn()?; let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?; @@ -1351,9 +1351,6 @@ impl IndexScheduler { for (task, (_, settings)) in tasks.iter_mut().zip(settings) { let checked_settings = settings.clone().check(); - if matches!(checked_settings.embedders, milli::update::Setting::Set(_)) { - self.features().check_vector("Passing `embedders` in settings")? - } task.details = Some(Details::SettingsUpdate { settings: Box::new(settings) }); apply_settings_to_builder(&checked_settings, &mut builder); diff --git a/index-scheduler/src/index_mapper/mod.rs b/index-scheduler/src/index_mapper/mod.rs index 18aed42b0..58ec2bf11 100644 --- a/index-scheduler/src/index_mapper/mod.rs +++ b/index-scheduler/src/index_mapper/mod.rs @@ -69,6 +69,10 @@ pub struct IndexMapper { /// Whether we open a meilisearch index with the MDB_WRITEMAP option or not. enable_mdb_writemap: bool, pub indexer_config: Arc, + + /// A few types of long running batches of tasks that act on a single index set this field + /// so that a handle to the index is available from other threads (search) in an optimized manner. + currently_updating_index: Arc>>, } /// Whether the index is available for use or is forbidden to be inserted back in the index map @@ -151,6 +155,7 @@ impl IndexMapper { index_growth_amount, enable_mdb_writemap, indexer_config: Arc::new(indexer_config), + currently_updating_index: Default::default(), }) } @@ -303,6 +308,14 @@ impl IndexMapper { /// Return an index, may open it if it wasn't already opened. pub fn index(&self, rtxn: &RoTxn, name: &str) -> Result { + if let Some((current_name, current_index)) = + self.currently_updating_index.read().unwrap().as_ref() + { + if current_name == name { + return Ok(current_index.clone()); + } + } + let uuid = self .index_mapping .get(rtxn, name)? @@ -474,4 +487,8 @@ impl IndexMapper { pub fn indexer_config(&self) -> &IndexerConfig { &self.indexer_config } + + pub fn set_currently_updating_index(&self, index: Option<(String, Index)>) { + *self.currently_updating_index.write().unwrap() = index; + } } diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index ddb9e934a..42f041578 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -37,10 +37,11 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { snapshots_path: _, auth_path: _, version_file_path: _, + webhook_url: _, + webhook_authorization_header: _, test_breakpoint_sdr: _, planned_failures: _, run_loop_iteration: _, - currently_updating_index: _, embedders: _, } = scheduler; diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b9b360fa4..a5b0cb5b0 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -34,6 +34,7 @@ pub type TaskId = u32; use std::collections::{BTreeMap, HashMap}; use std::fs::File; +use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicBool; @@ -45,6 +46,8 @@ use dump::{KindDump, TaskDump, UpdateFile}; pub use error::Error; pub use features::RoFeatures; use file_store::FileStore; +use flate2::bufread::GzEncoder; +use flate2::Compression; use meilisearch_types::error::ResponseError; use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; @@ -54,6 +57,7 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; +use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use puffin::FrameView; use roaring::RoaringBitmap; @@ -170,8 +174,8 @@ impl ProcessingTasks { } /// Set the processing tasks to an empty list - fn stop_processing(&mut self) { - self.processing = RoaringBitmap::new(); + fn stop_processing(&mut self) -> RoaringBitmap { + std::mem::take(&mut self.processing) } /// Returns `true` if there, at least, is one task that is currently processing that we must stop. @@ -241,6 +245,10 @@ pub struct IndexSchedulerOptions { pub snapshots_path: PathBuf, /// The path to the folder containing the dumps. pub dumps_path: PathBuf, + /// The URL on which we must send the tasks statuses + pub webhook_url: Option, + /// The value we will send into the Authorization HTTP header on the webhook URL + pub webhook_authorization_header: Option, /// The maximum size, in bytes, of the task index. pub task_db_size: usize, /// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index. @@ -323,6 +331,11 @@ pub struct IndexScheduler { /// The maximum number of tasks that will be batched together. pub(crate) max_number_of_batched_tasks: usize, + /// The webhook url we should send tasks to after processing every batches. + pub(crate) webhook_url: Option, + /// The Authorization header to send to the webhook URL. + pub(crate) webhook_authorization_header: Option, + /// A frame to output the indexation profiling files to disk. pub(crate) puffin_frame: Arc, @@ -338,10 +351,6 @@ pub struct IndexScheduler { /// The path to the version file of Meilisearch. pub(crate) version_file_path: PathBuf, - /// A few types of long running batches of tasks that act on a single index set this field - /// so that a handle to the index is available from other threads (search) in an optimized manner. - currently_updating_index: Arc>>, - embedders: Arc>>>, // ================= test @@ -388,7 +397,8 @@ impl IndexScheduler { dumps_path: self.dumps_path.clone(), auth_path: self.auth_path.clone(), version_file_path: self.version_file_path.clone(), - currently_updating_index: self.currently_updating_index.clone(), + webhook_url: self.webhook_url.clone(), + webhook_authorization_header: self.webhook_authorization_header.clone(), embedders: self.embedders.clone(), #[cfg(test)] test_breakpoint_sdr: self.test_breakpoint_sdr.clone(), @@ -487,7 +497,8 @@ impl IndexScheduler { snapshots_path: options.snapshots_path, auth_path: options.auth_path, version_file_path: options.version_file_path, - currently_updating_index: Arc::new(RwLock::new(None)), + webhook_url: options.webhook_url, + webhook_authorization_header: options.webhook_authorization_header, embedders: Default::default(), #[cfg(test)] @@ -671,13 +682,6 @@ impl IndexScheduler { /// If you need to fetch information from or perform an action on all indexes, /// see the `try_for_each_index` function. pub fn index(&self, name: &str) -> Result { - if let Some((current_name, current_index)) = - self.currently_updating_index.read().unwrap().as_ref() - { - if current_name == name { - return Ok(current_index.clone()); - } - } let rtxn = self.env.read_txn()?; self.index_mapper.index(&rtxn, name) } @@ -1158,7 +1162,7 @@ impl IndexScheduler { }; // Reset the currently updating index to relinquish the index handle - *self.currently_updating_index.write().unwrap() = None; + self.index_mapper.set_currently_updating_index(None); #[cfg(test)] self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?; @@ -1251,19 +1255,99 @@ impl IndexScheduler { } } - self.processing_tasks.write().unwrap().stop_processing(); + let processed = self.processing_tasks.write().unwrap().stop_processing(); #[cfg(test)] self.maybe_fail(tests::FailureLocation::CommittingWtxn)?; wtxn.commit().map_err(Error::HeedTransaction)?; + // We shouldn't crash the tick function if we can't send data to the webhook. + let _ = self.notify_webhook(&processed); + #[cfg(test)] self.breakpoint(Breakpoint::AfterProcessing); Ok(TickOutcome::TickAgain(processed_tasks)) } + /// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one. + fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> { + if let Some(ref url) = self.webhook_url { + struct TaskReader<'a, 'b> { + rtxn: &'a RoTxn<'a>, + index_scheduler: &'a IndexScheduler, + tasks: &'b mut roaring::bitmap::Iter<'b>, + buffer: Vec, + written: usize, + } + + impl<'a, 'b> Read for TaskReader<'a, 'b> { + fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result { + if self.buffer.is_empty() { + match self.tasks.next() { + None => return Ok(0), + Some(task_id) => { + let task = self + .index_scheduler + .get_task(self.rtxn, task_id) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + Error::CorruptedTaskQueue, + ) + })?; + + serde_json::to_writer( + &mut self.buffer, + &TaskView::from_task(&task), + )?; + self.buffer.push(b'\n'); + } + } + } + + let mut to_write = &self.buffer[self.written..]; + let wrote = io::copy(&mut to_write, &mut buf)?; + self.written += wrote as usize; + + // we wrote everything and must refresh our buffer on the next call + if self.written == self.buffer.len() { + self.written = 0; + self.buffer.clear(); + } + + Ok(wrote as usize) + } + } + + let rtxn = self.env.read_txn()?; + + let task_reader = TaskReader { + rtxn: &rtxn, + index_scheduler: self, + tasks: &mut updated.into_iter(), + buffer: Vec::with_capacity(50), // on average a task is around ~100 bytes + written: 0, + }; + + // let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); + let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); + let request = ureq::post(url).set("Content-Encoding", "gzip"); + let request = match &self.webhook_authorization_header { + Some(header) => request.set("Authorization", header), + None => request, + }; + + if let Err(e) = request.send(reader) { + log::error!("While sending data to the webhook: {e}"); + } + } + + Ok(()) + } + /// Register a task to cleanup the task queue if needed fn cleanup_task_queue(&self) -> Result<()> { let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; @@ -1677,6 +1761,8 @@ mod tests { indexes_path: tempdir.path().join("indexes"), snapshots_path: tempdir.path().join("snapshots"), dumps_path: tempdir.path().join("dumps"), + webhook_url: None, + webhook_authorization_header: None, task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. enable_mdb_writemap: false, diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 62591e991..2182b1836 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -344,7 +344,10 @@ impl ErrorCode for milli::Error { Code::InvalidDocumentId } UserError::MissingDocumentField(_) => Code::InvalidDocumentFields, - UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders, + UserError::InvalidFieldForSource { .. } + | UserError::MissingFieldForSource { .. } + | UserError::InvalidOpenAiModel { .. } + | UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound, diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs index b0762563a..e4f5cbeb4 100644 --- a/meilisearch-types/src/lib.rs +++ b/meilisearch-types/src/lib.rs @@ -9,6 +9,7 @@ pub mod index_uid_pattern; pub mod keys; pub mod settings; pub mod star_or; +pub mod task_view; pub mod tasks; pub mod versioning; pub use milli::{heed, Index}; diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index b0dee69a3..ca46abb0c 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -318,6 +318,21 @@ impl Settings { _kind: PhantomData, } } + + pub fn validate(self) -> Result { + self.validate_embedding_settings() + } + + fn validate_embedding_settings(mut self) -> Result { + let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; + for (name, config) in configs.iter_mut() { + let config_to_check = std::mem::take(config); + let checked_config = milli::update::validate_embedding_settings(config_to_check, name)?; + *config = checked_config + } + self.embedders = Setting::Set(configs); + Ok(self) + } } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -585,11 +600,12 @@ pub fn settings( ), }; - let embedders = index + let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() .map(|(name, config)| (name, Setting::Set(config.into()))) .collect(); + let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; Ok(Settings { displayed_attributes: match displayed_attributes { @@ -611,15 +627,12 @@ pub fn settings( Some(field) => Setting::Set(field), None => Setting::Reset, }, - proximity_precision: match proximity_precision { - Some(precision) => Setting::Set(precision), - None => Setting::Reset, - }, + proximity_precision: Setting::Set(proximity_precision.unwrap_or_default()), synonyms: Setting::Set(synonyms), typo_tolerance: Setting::Set(typo_tolerance), faceting: Setting::Set(faceting), pagination: Setting::Set(pagination), - embedders: Setting::Set(embedders), + embedders, _kind: PhantomData, }) } @@ -720,10 +733,11 @@ impl From for Criterion { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub enum ProximityPrecisionView { + #[default] ByWord, ByAttribute, } diff --git a/meilisearch-types/src/task_view.rs b/meilisearch-types/src/task_view.rs new file mode 100644 index 000000000..02be91a88 --- /dev/null +++ b/meilisearch-types/src/task_view.rs @@ -0,0 +1,139 @@ +use serde::Serialize; +use time::{Duration, OffsetDateTime}; + +use crate::error::ResponseError; +use crate::settings::{Settings, Unchecked}; +use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct TaskView { + pub uid: TaskId, + #[serde(default)] + pub index_uid: Option, + pub status: Status, + #[serde(rename = "type")] + pub kind: Kind, + pub canceled_by: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub details: Option, + pub error: Option, + #[serde(serialize_with = "serialize_duration", default)] + pub duration: Option, + #[serde(with = "time::serde::rfc3339")] + pub enqueued_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339::option", default)] + pub started_at: Option, + #[serde(with = "time::serde::rfc3339::option", default)] + pub finished_at: Option, +} + +impl TaskView { + pub fn from_task(task: &Task) -> TaskView { + TaskView { + uid: task.uid, + index_uid: task.index_uid().map(ToOwned::to_owned), + status: task.status, + kind: task.kind.as_kind(), + canceled_by: task.canceled_by, + details: task.details.clone().map(DetailsView::from), + error: task.error.clone(), + duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start), + enqueued_at: task.enqueued_at, + started_at: task.started_at, + finished_at: task.finished_at, + } + } +} + +#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DetailsView { + #[serde(skip_serializing_if = "Option::is_none")] + pub received_documents: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub indexed_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub primary_key: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub provided_ids: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub matched_tasks: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub canceled_tasks: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_tasks: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub original_filter: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub dump_uid: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(flatten)] + pub settings: Option>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub swaps: Option>, +} + +impl From
for DetailsView { + fn from(details: Details) -> Self { + match details { + Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => { + DetailsView { + received_documents: Some(received_documents), + indexed_documents: Some(indexed_documents), + ..DetailsView::default() + } + } + Details::SettingsUpdate { settings } => { + DetailsView { settings: Some(settings), ..DetailsView::default() } + } + Details::IndexInfo { primary_key } => { + DetailsView { primary_key: Some(primary_key), ..DetailsView::default() } + } + Details::DocumentDeletion { + provided_ids: received_document_ids, + deleted_documents, + } => DetailsView { + provided_ids: Some(received_document_ids), + deleted_documents: Some(deleted_documents), + original_filter: Some(None), + ..DetailsView::default() + }, + Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { + DetailsView { + provided_ids: Some(0), + original_filter: Some(Some(original_filter)), + deleted_documents: Some(deleted_documents), + ..DetailsView::default() + } + } + Details::ClearAll { deleted_documents } => { + DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() } + } + Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => { + DetailsView { + matched_tasks: Some(matched_tasks), + canceled_tasks: Some(canceled_tasks), + original_filter: Some(Some(original_filter)), + ..DetailsView::default() + } + } + Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => { + DetailsView { + matched_tasks: Some(matched_tasks), + deleted_tasks: Some(deleted_tasks), + original_filter: Some(Some(original_filter)), + ..DetailsView::default() + } + } + Details::Dump { dump_uid } => { + DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() } + } + Details::IndexSwap { swaps } => { + DetailsView { swaps: Some(swaps), ..Default::default() } + } + } + } +} diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index c59b38fa6..5d2423b95 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -104,6 +104,7 @@ walkdir = "2.3.3" yaup = "0.2.1" serde_urlencoded = "0.7.1" termcolor = "1.2.0" +url = { version = "2.5.0", features = ["serde"] } [dev-dependencies] actix-rt = "2.8.0" @@ -153,5 +154,5 @@ greek = ["meilisearch-types/greek"] khmer = ["meilisearch-types/khmer"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip" -sha1 = "83cd44ed1e5f97ecb581dc9f958a63f4ccc982d9" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.12/build.zip" +sha1 = "acfe9a018c93eb0604ea87ee87bff7df5474e18e" diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 1ad277c28..86a5eddb9 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -264,6 +264,8 @@ struct Infos { ignore_snapshot_if_db_exists: bool, http_addr: bool, http_payload_size_limit: Byte, + task_queue_webhook: bool, + task_webhook_authorization_header: bool, log_level: String, max_indexing_memory: MaxMemory, max_indexing_threads: MaxThreads, @@ -290,6 +292,8 @@ impl From for Infos { http_addr, master_key: _, env, + task_webhook_url, + task_webhook_authorization_header, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -343,6 +347,8 @@ impl From for Infos { http_addr: http_addr != default_http_addr(), http_payload_size_limit, experimental_max_number_of_batched_tasks, + task_queue_webhook: task_webhook_url.is_some(), + task_webhook_authorization_header: task_webhook_authorization_header.is_some(), log_level: log_level.to_string(), max_indexing_memory, max_indexing_threads, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index e0f488eea..f1111962c 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -228,6 +228,8 @@ fn open_or_create_database_unchecked( indexes_path: opt.db_path.join("indexes"), snapshots_path: opt.snapshot_dir.clone(), dumps_path: opt.dump_dir.clone(), + webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), + webhook_authorization_header: opt.task_webhook_authorization_header.clone(), task_db_size: opt.max_task_db_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize, enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 1ed20f5b5..a0672c9cf 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -21,6 +21,7 @@ use rustls::RootCertStore; use rustls_pemfile::{certs, pkcs8_private_keys, rsa_private_keys}; use serde::{Deserialize, Serialize}; use sysinfo::{RefreshKind, System, SystemExt}; +use url::Url; const POSSIBLE_ENV: [&str; 2] = ["development", "production"]; @@ -28,6 +29,8 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH"; const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR"; const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; +const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; +const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER"; #[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; @@ -156,6 +159,14 @@ pub struct Opt { #[serde(default = "default_env")] pub env: String, + /// Called whenever a task finishes so a third party can be notified. + #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] + pub task_webhook_url: Option, + + /// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified. + #[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)] + pub task_webhook_authorization_header: Option, + /// Deactivates Meilisearch's built-in telemetry when provided. /// /// Meilisearch automatically collects data from all instances that do not opt out using this flag. @@ -375,6 +386,8 @@ impl Opt { http_addr, master_key, env, + task_webhook_url, + task_webhook_authorization_header, max_index_size: _, max_task_db_size: _, http_payload_size_limit, @@ -409,6 +422,16 @@ impl Opt { export_to_env_if_not_present(MEILI_MASTER_KEY, master_key); } export_to_env_if_not_present(MEILI_ENV, env); + if let Some(task_webhook_url) = task_webhook_url { + export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string()); + } + if let Some(task_webhook_authorization_header) = task_webhook_authorization_header { + export_to_env_if_not_present( + MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER, + task_webhook_authorization_header, + ); + } + #[cfg(feature = "analytics")] { export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 290cab2e0..decc4ffc9 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -90,6 +90,11 @@ macro_rules! make_setting_route { ..Default::default() }; + let new_settings = $crate::routes::indexes::settings::validate_settings( + new_settings, + &index_scheduler, + )?; + let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -453,7 +458,7 @@ make_setting_route!( json!({ "proximity_precision": { "set": precision.is_some(), - "value": precision, + "value": precision.unwrap_or_default(), } }), Some(req), @@ -582,13 +587,13 @@ fn embedder_analytics( for source in s .values() .filter_map(|config| config.clone().set()) - .filter_map(|config| config.embedder_options.set()) + .filter_map(|config| config.source.set()) { - use meilisearch_types::milli::vector::settings::EmbedderSettings; + use meilisearch_types::milli::vector::settings::EmbedderSource; match source { - EmbedderSettings::OpenAi(_) => sources.insert("openAi"), - EmbedderSettings::HuggingFace(_) => sources.insert("huggingFace"), - EmbedderSettings::UserProvided(_) => sources.insert("userProvided"), + EmbedderSource::OpenAi => sources.insert("openAi"), + EmbedderSource::HuggingFace => sources.insert("huggingFace"), + EmbedderSource::UserProvided => sources.insert("userProvided"), }; } }; @@ -651,6 +656,7 @@ pub async fn update_all( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let new_settings = body.into_inner(); + let new_settings = validate_settings(new_settings, &index_scheduler)?; analytics.publish( "Settings Updated".to_string(), @@ -684,7 +690,8 @@ pub async fn update_all( "set": new_settings.distinct_attribute.as_ref().set().is_some() }, "proximity_precision": { - "set": new_settings.proximity_precision.as_ref().set().is_some() + "set": new_settings.proximity_precision.as_ref().set().is_some(), + "value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default() }, "typo_tolerance": { "enabled": new_settings.typo_tolerance @@ -800,3 +807,13 @@ pub async fn delete_all( debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } + +fn validate_settings( + settings: Settings, + index_scheduler: &IndexScheduler, +) -> Result, ResponseError> { + if matches!(settings.embedders, Setting::Set(_)) { + index_scheduler.features().check_vector("Passing `embedders` in settings")? + } + Ok(settings.validate()?) +} diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index f7d4c44d7..03b63001d 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -8,11 +8,9 @@ use meilisearch_types::deserr::DeserrQueryParamError; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{InvalidTaskDateError, ResponseError}; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList}; -use meilisearch_types::tasks::{ - serialize_duration, Details, IndexSwap, Kind, KindWithContent, Status, Task, -}; +use meilisearch_types::task_view::TaskView; +use meilisearch_types::tasks::{Kind, KindWithContent, Status}; use serde::Serialize; use serde_json::json; use time::format_description::well_known::Rfc3339; @@ -37,140 +35,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks)))) .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))); } - -#[derive(Debug, Clone, PartialEq, Eq, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct TaskView { - pub uid: TaskId, - #[serde(default)] - pub index_uid: Option, - pub status: Status, - #[serde(rename = "type")] - pub kind: Kind, - pub canceled_by: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option, - pub error: Option, - #[serde(serialize_with = "serialize_duration", default)] - pub duration: Option, - #[serde(with = "time::serde::rfc3339")] - pub enqueued_at: OffsetDateTime, - #[serde(with = "time::serde::rfc3339::option", default)] - pub started_at: Option, - #[serde(with = "time::serde::rfc3339::option", default)] - pub finished_at: Option, -} - -impl TaskView { - pub fn from_task(task: &Task) -> TaskView { - TaskView { - uid: task.uid, - index_uid: task.index_uid().map(ToOwned::to_owned), - status: task.status, - kind: task.kind.as_kind(), - canceled_by: task.canceled_by, - details: task.details.clone().map(DetailsView::from), - error: task.error.clone(), - duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start), - enqueued_at: task.enqueued_at, - started_at: task.started_at, - finished_at: task.finished_at, - } - } -} - -#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct DetailsView { - #[serde(skip_serializing_if = "Option::is_none")] - pub received_documents: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub indexed_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub primary_key: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub provided_ids: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub deleted_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub matched_tasks: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub canceled_tasks: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub deleted_tasks: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub original_filter: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub dump_uid: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(flatten)] - pub settings: Option>>, - #[serde(skip_serializing_if = "Option::is_none")] - pub swaps: Option>, -} - -impl From
for DetailsView { - fn from(details: Details) -> Self { - match details { - Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => { - DetailsView { - received_documents: Some(received_documents), - indexed_documents: Some(indexed_documents), - ..DetailsView::default() - } - } - Details::SettingsUpdate { settings } => { - DetailsView { settings: Some(settings), ..DetailsView::default() } - } - Details::IndexInfo { primary_key } => { - DetailsView { primary_key: Some(primary_key), ..DetailsView::default() } - } - Details::DocumentDeletion { - provided_ids: received_document_ids, - deleted_documents, - } => DetailsView { - provided_ids: Some(received_document_ids), - deleted_documents: Some(deleted_documents), - original_filter: Some(None), - ..DetailsView::default() - }, - Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { - DetailsView { - provided_ids: Some(0), - original_filter: Some(Some(original_filter)), - deleted_documents: Some(deleted_documents), - ..DetailsView::default() - } - } - Details::ClearAll { deleted_documents } => { - DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() } - } - Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => { - DetailsView { - matched_tasks: Some(matched_tasks), - canceled_tasks: Some(canceled_tasks), - original_filter: Some(Some(original_filter)), - ..DetailsView::default() - } - } - Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => { - DetailsView { - matched_tasks: Some(matched_tasks), - deleted_tasks: Some(deleted_tasks), - original_filter: Some(Some(original_filter)), - ..DetailsView::default() - } - } - Details::Dump { dump_uid } => { - DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() } - } - Details::IndexSwap { swaps } => { - DetailsView { swaps: Some(swaps), ..Default::default() } - } - } - } -} - #[derive(Debug, Deserr)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] pub struct TasksFilterQuery { diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index b5dba8a58..551f89216 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -735,6 +735,9 @@ pub fn perform_facet_search( if let Some(facet_query) = &facet_query { facet_search.query(facet_query); } + if let Some(max_facets) = index.max_values_per_facet(&rtxn)? { + facet_search.max_values(max_facets as usize); + } Ok(FacetSearchResult { facet_hits: facet_search.execute()?, @@ -897,6 +900,14 @@ fn format_fields<'a>( let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); + // reduce the formatted option list to the attributes that should be formatted, + // instead of all the attributes to display. + let formatting_fields_options: Vec<_> = formatted_options + .iter() + .filter(|(_, option)| option.should_format()) + .map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option)) + .collect(); + // select the attributes to retrieve let displayable_names = displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name")); @@ -905,13 +916,15 @@ fn format_fields<'a>( // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only // highlighted. - let format = formatted_options + // Warn: The time to compute the format list scales with the number of fields to format; + // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity: + // d*f where d is the total number of fields to display and f is the total number of fields to format. + let format = formatting_fields_options .iter() - .filter(|(field, _option)| { - let name = field_ids_map.name(**field).unwrap(); + .filter(|(name, _option)| { milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) }) - .map(|(_, option)| *option) + .map(|(_, option)| **option) .reduce(|acc, option| acc.merge(option)); let mut infos = Vec::new(); @@ -1008,7 +1021,7 @@ fn format_value<'a>( let value = matcher.format(format_options); Value::String(value.into_owned()) } - None => Value::Number(number), + None => Value::String(s), } } value => value, diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 5ad0ed247..fd34268a5 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -59,7 +59,7 @@ async fn import_dump_v1_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -77,8 +77,7 @@ async fn import_dump_v1_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -221,7 +220,7 @@ async fn import_dump_v1_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -239,8 +238,7 @@ async fn import_dump_v1_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -369,7 +367,7 @@ async fn import_dump_v1_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -387,8 +385,7 @@ async fn import_dump_v1_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -503,7 +500,7 @@ async fn import_dump_v2_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -521,8 +518,7 @@ async fn import_dump_v2_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -649,7 +645,7 @@ async fn import_dump_v2_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -667,8 +663,7 @@ async fn import_dump_v2_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -794,7 +789,7 @@ async fn import_dump_v2_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -812,8 +807,7 @@ async fn import_dump_v2_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -928,7 +922,7 @@ async fn import_dump_v3_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -946,8 +940,7 @@ async fn import_dump_v3_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1074,7 +1067,7 @@ async fn import_dump_v3_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1092,8 +1085,7 @@ async fn import_dump_v3_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1219,7 +1211,7 @@ async fn import_dump_v3_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1237,8 +1229,7 @@ async fn import_dump_v3_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1353,7 +1344,7 @@ async fn import_dump_v4_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1371,8 +1362,7 @@ async fn import_dump_v4_movie_raw() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1499,7 +1489,7 @@ async fn import_dump_v4_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1517,8 +1507,7 @@ async fn import_dump_v4_movie_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1644,7 +1633,7 @@ async fn import_dump_v4_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, - "proximityPrecision": null, + "proximityPrecision": "byWord", "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1662,8 +1651,7 @@ async fn import_dump_v4_rubygems_with_settings() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "### ); @@ -1907,8 +1895,7 @@ async fn import_dump_v6_containing_experimental_features() { }, "pagination": { "maxTotalHits": 1000 - }, - "embedders": {} + } } "###); diff --git a/meilisearch/tests/search/facet_search.rs b/meilisearch/tests/search/facet_search.rs index 8c1229f1a..5f9f631f9 100644 --- a/meilisearch/tests/search/facet_search.rs +++ b/meilisearch/tests/search/facet_search.rs @@ -105,6 +105,24 @@ async fn more_advanced_facet_search() { snapshot!(response["facetHits"].as_array().unwrap().len(), @"1"); } +#[actix_rt::test] +async fn simple_facet_search_with_max_values() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await; + index.update_settings_filterable_attributes(json!(["genres"])).await; + index.add_documents(documents, None).await; + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn non_filterable_facet_search_error() { let server = Server::new().await; diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index fb6fe297f..79819cab2 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -21,9 +21,9 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde "###); let (response, code) = index - .update_settings( - json!({ "embedders": {"default": {"source": {"userProvided": {"dimensions": 2}}}} }), - ) + .update_settings(json!({ "embedders": {"default": { + "source": "userProvided", + "dimensions": 2}}} )) .await; assert_eq!(202, code, "{:?}", response); index.wait_task(response.uid()).await; @@ -56,6 +56,15 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { }]) }); +static SINGLE_DOCUMENT: Lazy = Lazy::new(|| { + json!([{ + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": {"default": [1.0, 3.0]}, + }]) +}); + #[actix_rt::test] async fn simple_search() { let server = Server::new().await; @@ -149,3 +158,18 @@ async fn invalid_semantic_ratio() { } "###); } + +#[actix_rt::test] +async fn single_document() { + let server = Server::new().await; + let index = index_with_documents(&server, &SINGLE_DOCUMENT).await; + + let (response, code) = index + .search_post( + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + ) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); +} diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 133a143fd..9b7b01029 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -890,13 +890,21 @@ async fn experimental_feature_vector_store() { let (response, code) = index .update_settings(json!({"embedders": { "manual": { - "source": { - "userProvided": {"dimensions": 3} - } + "source": "userProvided", + "dimensions": 3, } }})) .await; + meili_snap::snapshot!(response, @r###" + { + "taskUid": 1, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); meili_snap::snapshot!(code, @"202 Accepted"); let response = index.wait_task(response.uid()).await; diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 9ab53c51e..5642e854f 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -54,7 +54,7 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 16); + assert_eq!(settings.keys().len(), 15); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); @@ -83,7 +83,7 @@ async fn get_settings() { "maxTotalHits": 1000, }) ); - assert_eq!(settings["embedders"], json!({})); + assert_eq!(settings["proximityPrecision"], json!("byWord")); } #[actix_rt::test] diff --git a/meilisearch/tests/tasks/mod.rs b/meilisearch/tests/tasks/mod.rs index 7a5fa6388..ed387224e 100644 --- a/meilisearch/tests/tasks/mod.rs +++ b/meilisearch/tests/tasks/mod.rs @@ -1,4 +1,5 @@ mod errors; +mod webhook; use meili_snap::insta::assert_json_snapshot; use time::format_description::well_known::Rfc3339; diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs new file mode 100644 index 000000000..6979ff294 --- /dev/null +++ b/meilisearch/tests/tasks/webhook.rs @@ -0,0 +1,123 @@ +//! To test the webhook, we need to spawn a new server with a URL listening for +//! post requests. The webhook handle starts a server and forwards all the +//! received requests into a channel for you to handle. + +use std::sync::Arc; + +use actix_http::body::MessageBody; +use actix_web::dev::{ServiceFactory, ServiceResponse}; +use actix_web::web::{Bytes, Data}; +use actix_web::{post, App, HttpResponse, HttpServer}; +use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; +use tokio::sync::mpsc; +use url::Url; + +use crate::common::{default_settings, Server}; +use crate::json; + +#[post("/")] +async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { + let body = body.to_vec(); + sender.send(body).unwrap(); + HttpResponse::Ok().into() +} + +fn create_app( + sender: Arc>>, +) -> actix_web::App< + impl ServiceFactory< + actix_web::dev::ServiceRequest, + Config = (), + Response = ServiceResponse, + Error = actix_web::Error, + InitError = (), + >, +> { + App::new().service(forward_body).app_data(Data::from(sender)) +} + +struct WebhookHandle { + pub server_handle: tokio::task::JoinHandle>, + pub url: String, + pub receiver: mpsc::UnboundedReceiver>, +} + +async fn create_webhook_server() -> WebhookHandle { + let mut log_builder = env_logger::Builder::new(); + log_builder.parse_filters("info"); + log_builder.init(); + + let (sender, receiver) = mpsc::unbounded_channel(); + let sender = Arc::new(sender); + + // By listening on the port 0, the system will give us any available port. + let server = + HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap(); + let (ip, scheme) = server.addrs_with_scheme()[0]; + let url = format!("{scheme}://{ip}/"); + + let server_handle = tokio::spawn(server.run()); + WebhookHandle { server_handle, url, receiver } +} + +#[actix_web::test] +async fn test_basic_webhook() { + let WebhookHandle { server_handle, url, mut receiver } = create_webhook_server().await; + + let db_path = tempfile::tempdir().unwrap(); + let server = Server::new_with_options(Opt { + task_webhook_url: Some(Url::parse(&url).unwrap()), + ..default_settings(db_path.path()) + }) + .await + .unwrap(); + + let index = server.index("tamo"); + // May be flaky: we're relying on the fact that while the first document addition is processed, the other + // operations will be received and will be batched together. If it doesn't happen it's not a problem + // the rest of the test won't assume anything about the number of tasks per batch. + for i in 0..5 { + let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await; + } + + let mut nb_tasks = 0; + while let Some(payload) = receiver.recv().await { + let payload = String::from_utf8(payload).unwrap(); + let jsonl = payload.split('\n'); + for json in jsonl { + if json.is_empty() { + break; // we reached EOF + } + nb_tasks += 1; + let json: serde_json::Value = serde_json::from_str(json).unwrap(); + snapshot!( + json_string!(json, { ".uid" => "[uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": "[uid]", + "indexUid": "tamo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + } + if nb_tasks == 5 { + break; + } + } + + assert!(nb_tasks == 5, "We should have received the 5 tasks but only received {nb_tasks}"); + + server_handle.abort(); +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b977d64f1..3e10c175a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -77,7 +77,7 @@ csv = "1.2.1" candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1" } +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] } hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ "online", ] } diff --git a/milli/src/error.rs b/milli/src/error.rs index 9c5d8f416..539861e73 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -192,7 +192,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MissingDocumentField(#[from] crate::prompt::error::RenderPromptError), #[error(transparent)] InvalidPrompt(#[from] crate::prompt::error::NewPromptError), - #[error("Invalid prompt in for embeddings with name '{0}': {1}.")] + #[error("`.embedders.{0}.documentTemplate`: Invalid template: {1}.")] InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), @@ -200,6 +200,33 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidEmbedder(String), #[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")] TooManyVectors(String, usize), + #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}", + allowed_sources_for_field + .iter() + .map(|accepted| format!("`{}`", accepted)) + .collect::>() + .join(", "), + allowed_fields_for_source + .iter() + .map(|accepted| format!("`{}`", accepted)) + .collect::>() + .join(", ") + )] + InvalidFieldForSource { + embedder_name: String, + source_: crate::vector::settings::EmbedderSource, + field: &'static str, + allowed_fields_for_source: &'static [&'static str], + allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource], + }, + #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())] + InvalidOpenAiModel { embedder_name: String, model: String }, + #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")] + MissingFieldForSource { + field: &'static str, + source_: crate::vector::settings::EmbedderSource, + embedder_name: String, + }, } impl From for Error { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3e4849578..7bac5ea0c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -27,8 +27,8 @@ static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); -/// The maximum number of facets returned by the facet search route. -const MAX_NUMBER_OF_FACETS: usize = 100; +/// The maximum number of values per facet returned by the facet search route. +const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100; pub mod facet; mod fst_utils; @@ -306,6 +306,7 @@ pub struct SearchForFacetValues<'a> { query: Option, facet: String, search_query: Search<'a>, + max_values: usize, is_hybrid: bool, } @@ -315,7 +316,13 @@ impl<'a> SearchForFacetValues<'a> { search_query: Search<'a>, is_hybrid: bool, ) -> SearchForFacetValues<'a> { - SearchForFacetValues { query: None, facet, search_query, is_hybrid } + SearchForFacetValues { + query: None, + facet, + search_query, + max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, + is_hybrid, + } } pub fn query(&mut self, query: impl Into) -> &mut Self { @@ -323,6 +330,11 @@ impl<'a> SearchForFacetValues<'a> { self } + pub fn max_values(&mut self, max: usize) -> &mut Self { + self.max_values = max; + self + } + fn one_original_value_of( &self, field_id: FieldId, @@ -462,7 +474,7 @@ impl<'a> SearchForFacetValues<'a> { .unwrap_or_else(|| left_bound.to_string()); results.push(FacetValueHit { value, count }); } - if results.len() >= MAX_NUMBER_OF_FACETS { + if results.len() >= self.max_values { break; } } @@ -507,7 +519,7 @@ impl<'a> SearchForFacetValues<'a> { .unwrap_or_else(|| query.to_string()); results.push(FacetValueHit { value, count }); } - if results.len() >= MAX_NUMBER_OF_FACETS { + if results.len() >= self.max_values { return Ok(ControlFlow::Break(())); } } diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index b46f6124f..b439b87ec 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -15,6 +15,7 @@ pub struct BucketSortOutput { // TODO: would probably be good to regroup some of these inside of a struct? #[allow(clippy::too_many_arguments)] +#[logging_timer::time] pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 067fa1efd..8de1d9262 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -72,7 +72,7 @@ impl<'m> MatcherBuilder<'m> { } } -#[derive(Copy, Clone, Default)] +#[derive(Copy, Clone, Default, Debug)] pub struct FormatOptions { pub highlight: bool, pub crop: Option, @@ -82,6 +82,10 @@ impl FormatOptions { pub fn merge(self, other: Self) -> Self { Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } } + + pub fn should_format(&self) -> bool { + self.highlight || self.crop.is_some() + } } #[derive(Clone, Debug)] diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 405b9747d..7b3b1d5b2 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -191,6 +191,7 @@ fn resolve_maximally_reduced_query_graph( Ok(docids) } +#[logging_timer::time] fn resolve_universe( ctx: &mut SearchContext, initial_universe: &RoaringBitmap, @@ -556,6 +557,7 @@ pub fn execute_vector_search( } #[allow(clippy::too_many_arguments)] +#[logging_timer::time] pub fn execute_search( ctx: &mut SearchContext, query: Option<&str>, diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 64fe07a31..865075d97 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -5,6 +5,7 @@ use super::*; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. +#[logging_timer::time] pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 540def4ca..e797c0380 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2553,7 +2553,7 @@ mod tests { /// Vectors must be of the same length. #[test] fn test_multiple_vectors() { - use crate::vector::settings::{EmbedderSettings, EmbeddingSettings}; + use crate::vector::settings::EmbeddingSettings; let index = TempIndex::new(); index @@ -2562,9 +2562,11 @@ mod tests { embedders.insert( "manual".to_string(), Setting::Set(EmbeddingSettings { - embedder_options: Setting::Set(EmbedderSettings::UserProvided( - crate::vector::settings::UserProvidedSettings { dimensions: 3 }, - )), + source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided), + model: Setting::NotSet, + revision: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::Set(3), document_template: Setting::NotSet, }), ); @@ -2579,10 +2581,10 @@ mod tests { .unwrap(); index.add_documents(documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}])).unwrap(); index - .add_documents( - documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]), - ) - .unwrap(); + .add_documents( + documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]), + ) + .unwrap(); let rtxn = index.read_txn().unwrap(); let res = index.search(&rtxn).vector([0.0, 1.0, 2.0].to_vec()).execute().unwrap(); diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index eb2b6e69a..66c52a52f 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -8,7 +8,7 @@ pub use self::index_documents::{ MergeFn, }; pub use self::indexer_config::IndexerConfig; -pub use self::settings::{Setting, Settings}; +pub use self::settings::{validate_embedding_settings, Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index d406c121c..d770bcd74 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -17,7 +17,7 @@ use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; -use crate::vector::settings::{EmbeddingSettings, PromptSettings}; +use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldsIdsMap, Index, OrderBy, Result}; @@ -78,11 +78,19 @@ impl Setting { } } - pub fn apply(&mut self, new: Self) { + /// Returns `true` if applying the new setting changed this setting + pub fn apply(&mut self, new: Self) -> bool + where + T: PartialEq + Eq, + { if let Setting::NotSet = new { - return; + return false; + } + if self == &new { + return false; } *self = new; + true } } @@ -950,17 +958,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) { match joined { + // updated config EitherOrBoth::Both((name, mut old), (_, new)) => { - old.apply(new); - let new = validate_prompt(&name, old)?; - changed = true; + changed |= old.apply(new); + let new = validate_embedding_settings(old, &name)?; new_configs.insert(name, new); } + // unchanged config EitherOrBoth::Left((name, setting)) => { new_configs.insert(name, setting); } - EitherOrBoth::Right((name, setting)) => { - let setting = validate_prompt(&name, setting)?; + // new config + EitherOrBoth::Right((name, mut setting)) => { + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; changed = true; new_configs.insert(name, setting); } @@ -1072,8 +1086,12 @@ fn validate_prompt( ) -> Result> { match new { Setting::Set(EmbeddingSettings { - embedder_options, - document_template: Setting::Set(PromptSettings { template: Setting::Set(template) }), + source, + model, + revision, + api_key, + dimensions, + document_template: Setting::Set(template), }) => { // validate let template = crate::prompt::Prompt::new(template) @@ -1081,16 +1099,71 @@ fn validate_prompt( .map_err(|inner| UserError::InvalidPromptForEmbeddings(name.to_owned(), inner))?; Ok(Setting::Set(EmbeddingSettings { - embedder_options, - document_template: Setting::Set(PromptSettings { - template: Setting::Set(template), - }), + source, + model, + revision, + api_key, + dimensions, + document_template: Setting::Set(template), })) } new => Ok(new), } } +pub fn validate_embedding_settings( + settings: Setting, + name: &str, +) -> Result> { + let settings = validate_prompt(name, settings)?; + let Setting::Set(settings) = settings else { return Ok(settings) }; + let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = + settings; + let Some(inferred_source) = source.set() else { + return Ok(Setting::Set(EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + })); + }; + match inferred_source { + EmbedderSource::OpenAi => { + check_unset(&revision, "revision", inferred_source, name)?; + check_unset(&dimensions, "dimensions", inferred_source, name)?; + if let Setting::Set(model) = &model { + crate::vector::openai::EmbeddingModel::from_name(model.as_str()).ok_or( + crate::error::UserError::InvalidOpenAiModel { + embedder_name: name.to_owned(), + model: model.clone(), + }, + )?; + } + } + EmbedderSource::HuggingFace => { + check_unset(&api_key, "apiKey", inferred_source, name)?; + check_unset(&dimensions, "dimensions", inferred_source, name)?; + } + EmbedderSource::UserProvided => { + check_unset(&model, "model", inferred_source, name)?; + check_unset(&revision, "revision", inferred_source, name)?; + check_unset(&api_key, "apiKey", inferred_source, name)?; + check_unset(&document_template, "documentTemplate", inferred_source, name)?; + check_set(&dimensions, "dimensions", inferred_source, name)?; + } + } + Ok(Setting::Set(EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + })) +} + #[cfg(test)] mod tests { use big_s::S; diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index c11e6ddc6..53e8a041b 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -34,6 +34,9 @@ pub struct EmbedderOptions { #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub enum EmbeddingModel { + // # WARNING + // + // If ever adding a model, make sure to add it to the list of supported models below. #[default] #[serde(rename = "text-embedding-ada-002")] #[deserr(rename = "text-embedding-ada-002")] @@ -41,6 +44,10 @@ pub enum EmbeddingModel { } impl EmbeddingModel { + pub fn supported_models() -> &'static [&'static str] { + &["text-embedding-ada-002"] + } + pub fn max_token(&self) -> usize { match self { EmbeddingModel::TextEmbeddingAda002 => 8191, @@ -59,7 +66,7 @@ impl EmbeddingModel { } } - pub fn from_name(name: &'static str) -> Option { + pub fn from_name(name: &str) -> Option { match name { "text-embedding-ada-002" => Some(EmbeddingModel::TextEmbeddingAda002), _ => None, diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 912cdf953..37fb80452 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -4,32 +4,189 @@ use serde::{Deserialize, Serialize}; use crate::prompt::PromptData; use crate::update::Setting; use crate::vector::EmbeddingConfig; +use crate::UserError; #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub struct EmbeddingSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "source")] - #[deserr(default, rename = "source")] - pub embedder_options: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] - pub document_template: Setting, + pub source: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub model: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub revision: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub api_key: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub dimensions: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub document_template: Setting, +} + +pub fn check_unset( + key: &Setting, + field: &'static str, + source: EmbedderSource, + embedder_name: &str, +) -> Result<(), UserError> { + if matches!(key, Setting::NotSet) { + Ok(()) + } else { + Err(UserError::InvalidFieldForSource { + embedder_name: embedder_name.to_owned(), + source_: source, + field, + allowed_fields_for_source: EmbeddingSettings::allowed_fields_for_source(source), + allowed_sources_for_field: EmbeddingSettings::allowed_sources_for_field(field), + }) + } +} + +pub fn check_set( + key: &Setting, + field: &'static str, + source: EmbedderSource, + embedder_name: &str, +) -> Result<(), UserError> { + if matches!(key, Setting::Set(_)) { + Ok(()) + } else { + Err(UserError::MissingFieldForSource { + field, + source_: source, + embedder_name: embedder_name.to_owned(), + }) + } +} + +impl EmbeddingSettings { + pub const SOURCE: &'static str = "source"; + pub const MODEL: &'static str = "model"; + pub const REVISION: &'static str = "revision"; + pub const API_KEY: &'static str = "apiKey"; + pub const DIMENSIONS: &'static str = "dimensions"; + pub const DOCUMENT_TEMPLATE: &'static str = "documentTemplate"; + + pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { + match field { + Self::SOURCE => { + &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::UserProvided] + } + Self::MODEL => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], + Self::REVISION => &[EmbedderSource::HuggingFace], + Self::API_KEY => &[EmbedderSource::OpenAi], + Self::DIMENSIONS => &[EmbedderSource::UserProvided], + Self::DOCUMENT_TEMPLATE => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], + _other => unreachable!("unknown field"), + } + } + + pub fn allowed_fields_for_source(source: EmbedderSource) -> &'static [&'static str] { + match source { + EmbedderSource::OpenAi => { + &[Self::SOURCE, Self::MODEL, Self::API_KEY, Self::DOCUMENT_TEMPLATE] + } + EmbedderSource::HuggingFace => { + &[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE] + } + EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS], + } + } + + pub(crate) fn apply_default_source(setting: &mut Setting) { + if let Setting::Set(EmbeddingSettings { + source: source @ (Setting::NotSet | Setting::Reset), + .. + }) = setting + { + *source = Setting::Set(EmbedderSource::default()) + } + } +} + +#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub enum EmbedderSource { + #[default] + OpenAi, + HuggingFace, + UserProvided, +} + +impl std::fmt::Display for EmbedderSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + EmbedderSource::OpenAi => "openAi", + EmbedderSource::HuggingFace => "huggingFace", + EmbedderSource::UserProvided => "userProvided", + }; + f.write_str(s) + } } impl EmbeddingSettings { pub fn apply(&mut self, new: Self) { - let EmbeddingSettings { embedder_options, document_template: prompt } = new; - self.embedder_options.apply(embedder_options); - self.document_template.apply(prompt); + let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = + new; + let old_source = self.source; + self.source.apply(source); + // Reinitialize the whole setting object on a source change + if old_source != self.source { + *self = EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + }; + return; + } + + self.model.apply(model); + self.revision.apply(revision); + self.api_key.apply(api_key); + self.dimensions.apply(dimensions); + self.document_template.apply(document_template); } } impl From for EmbeddingSettings { fn from(value: EmbeddingConfig) -> Self { - Self { - embedder_options: Setting::Set(value.embedder_options.into()), - document_template: Setting::Set(value.prompt.into()), + let EmbeddingConfig { embedder_options, prompt } = value; + match embedder_options { + super::EmbedderOptions::HuggingFace(options) => Self { + source: Setting::Set(EmbedderSource::HuggingFace), + model: Setting::Set(options.model), + revision: options.revision.map(Setting::Set).unwrap_or_default(), + api_key: Setting::NotSet, + dimensions: Setting::NotSet, + document_template: Setting::Set(prompt.template), + }, + super::EmbedderOptions::OpenAi(options) => Self { + source: Setting::Set(EmbedderSource::OpenAi), + model: Setting::Set(options.embedding_model.name().to_owned()), + revision: Setting::NotSet, + api_key: options.api_key.map(Setting::Set).unwrap_or_default(), + dimensions: Setting::NotSet, + document_template: Setting::Set(prompt.template), + }, + super::EmbedderOptions::UserProvided(options) => Self { + source: Setting::Set(EmbedderSource::UserProvided), + model: Setting::NotSet, + revision: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::Set(options.dimensions), + document_template: Setting::NotSet, + }, } } } @@ -37,256 +194,51 @@ impl From for EmbeddingSettings { impl From for EmbeddingConfig { fn from(value: EmbeddingSettings) -> Self { let mut this = Self::default(); - let EmbeddingSettings { embedder_options, document_template: prompt } = value; - if let Some(embedder_options) = embedder_options.set() { - this.embedder_options = embedder_options.into(); - } - if let Some(prompt) = prompt.set() { - this.prompt = prompt.into(); - } - this - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct PromptSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub template: Setting, -} - -impl PromptSettings { - pub fn apply(&mut self, new: Self) { - let PromptSettings { template } = new; - self.template.apply(template); - } -} - -impl From for PromptSettings { - fn from(value: PromptData) -> Self { - Self { template: Setting::Set(value.template) } - } -} - -impl From for PromptData { - fn from(value: PromptSettings) -> Self { - let mut this = PromptData::default(); - let PromptSettings { template } = value; - if let Some(template) = template.set() { - this.template = template; - } - this - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -pub enum EmbedderSettings { - HuggingFace(Setting), - OpenAi(Setting), - UserProvided(UserProvidedSettings), -} - -impl Deserr for EmbedderSettings -where - E: deserr::DeserializeError, -{ - fn deserialize_from_value( - value: deserr::Value, - location: deserr::ValuePointerRef, - ) -> Result { - match value { - deserr::Value::Map(map) => { - if deserr::Map::len(&map) != 1 { - return Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::Unexpected { - msg: format!( - "Expected a single field, got {} fields", - deserr::Map::len(&map) - ), - }, - location, - ))); + let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = + value; + if let Some(source) = source.set() { + match source { + EmbedderSource::OpenAi => { + let mut options = super::openai::EmbedderOptions::with_default_model(None); + if let Some(model) = model.set() { + if let Some(model) = super::openai::EmbeddingModel::from_name(&model) { + options.embedding_model = model; + } + } + if let Some(api_key) = api_key.set() { + options.api_key = Some(api_key); + } + this.embedder_options = super::EmbedderOptions::OpenAi(options); } - let mut it = deserr::Map::into_iter(map); - let (k, v) = it.next().unwrap(); - - match k.as_str() { - "huggingFace" => Ok(EmbedderSettings::HuggingFace(Setting::Set( - HfEmbedderSettings::deserialize_from_value( - v.into_value(), - location.push_key(&k), - )?, - ))), - "openAi" => Ok(EmbedderSettings::OpenAi(Setting::Set( - OpenAiEmbedderSettings::deserialize_from_value( - v.into_value(), - location.push_key(&k), - )?, - ))), - "userProvided" => Ok(EmbedderSettings::UserProvided( - UserProvidedSettings::deserialize_from_value( - v.into_value(), - location.push_key(&k), - )?, - )), - other => Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::UnknownKey { - key: other, - accepted: &["huggingFace", "openAi", "userProvided"], - }, - location, - ))), + EmbedderSource::HuggingFace => { + let mut options = super::hf::EmbedderOptions::default(); + if let Some(model) = model.set() { + options.model = model; + // Reset the revision if we are setting the model. + // This allows the following: + // "huggingFace": {} -> default model with default revision + // "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision + // "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision + options.revision = None; + } + if let Some(revision) = revision.set() { + options.revision = Some(revision); + } + this.embedder_options = super::EmbedderOptions::HuggingFace(options); + } + EmbedderSource::UserProvided => { + this.embedder_options = + super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { + dimensions: dimensions.set().unwrap(), + }); } } - _ => Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::IncorrectValueKind { - actual: value, - accepted: &[deserr::ValueKind::Map], - }, - location, - ))), } - } -} -impl Default for EmbedderSettings { - fn default() -> Self { - Self::OpenAi(Default::default()) - } -} - -impl From for EmbedderSettings { - fn from(value: crate::vector::EmbedderOptions) -> Self { - match value { - crate::vector::EmbedderOptions::HuggingFace(hf) => { - Self::HuggingFace(Setting::Set(hf.into())) - } - crate::vector::EmbedderOptions::OpenAi(openai) => { - Self::OpenAi(Setting::Set(openai.into())) - } - crate::vector::EmbedderOptions::UserProvided(user_provided) => { - Self::UserProvided(user_provided.into()) - } + if let Setting::Set(template) = document_template { + this.prompt = PromptData { template } } - } -} -impl From for crate::vector::EmbedderOptions { - fn from(value: EmbedderSettings) -> Self { - match value { - EmbedderSettings::HuggingFace(Setting::Set(hf)) => Self::HuggingFace(hf.into()), - EmbedderSettings::HuggingFace(_setting) => Self::HuggingFace(Default::default()), - EmbedderSettings::OpenAi(Setting::Set(ai)) => Self::OpenAi(ai.into()), - EmbedderSettings::OpenAi(_setting) => { - Self::OpenAi(crate::vector::openai::EmbedderOptions::with_default_model(None)) - } - EmbedderSettings::UserProvided(user_provided) => { - Self::UserProvided(user_provided.into()) - } - } - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct HfEmbedderSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub model: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub revision: Setting, -} - -impl HfEmbedderSettings { - pub fn apply(&mut self, new: Self) { - let HfEmbedderSettings { model, revision } = new; - self.model.apply(model); - self.revision.apply(revision); - } -} - -impl From for HfEmbedderSettings { - fn from(value: crate::vector::hf::EmbedderOptions) -> Self { - Self { - model: Setting::Set(value.model), - revision: value.revision.map(Setting::Set).unwrap_or(Setting::NotSet), - } - } -} - -impl From for crate::vector::hf::EmbedderOptions { - fn from(value: HfEmbedderSettings) -> Self { - let HfEmbedderSettings { model, revision } = value; - let mut this = Self::default(); - if let Some(model) = model.set() { - this.model = model; - } - if let Some(revision) = revision.set() { - this.revision = Some(revision); - } this } } - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct OpenAiEmbedderSettings { - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub api_key: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "model")] - #[deserr(default, rename = "model")] - pub embedding_model: Setting, -} - -impl OpenAiEmbedderSettings { - pub fn apply(&mut self, new: Self) { - let Self { api_key, embedding_model: embedding_mode } = new; - self.api_key.apply(api_key); - self.embedding_model.apply(embedding_mode); - } -} - -impl From for OpenAiEmbedderSettings { - fn from(value: crate::vector::openai::EmbedderOptions) -> Self { - Self { - api_key: value.api_key.map(Setting::Set).unwrap_or(Setting::Reset), - embedding_model: Setting::Set(value.embedding_model), - } - } -} - -impl From for crate::vector::openai::EmbedderOptions { - fn from(value: OpenAiEmbedderSettings) -> Self { - let OpenAiEmbedderSettings { api_key, embedding_model } = value; - Self { api_key: api_key.set(), embedding_model: embedding_model.set().unwrap_or_default() } - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] -#[serde(deny_unknown_fields, rename_all = "camelCase")] -#[deserr(rename_all = camelCase, deny_unknown_fields)] -pub struct UserProvidedSettings { - pub dimensions: usize, -} - -impl From for crate::vector::manual::EmbedderOptions { - fn from(value: UserProvidedSettings) -> Self { - Self { dimensions: value.dimensions } - } -} - -impl From for UserProvidedSettings { - fn from(value: crate::vector::manual::EmbedderOptions) -> Self { - Self { dimensions: value.dimensions } - } -}