diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 016a9d282..c53946fea 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -65,9 +65,9 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, windows-2022] + os: [macos-13, windows-2022] include: - - os: macos-12 + - os: macos-13 artifact_name: meilisearch asset_name: meilisearch-macos-amd64 - os: windows-2022 @@ -90,7 +90,7 @@ jobs: publish-macos-apple-silicon: name: Publish binary for macOS silicon - runs-on: macos-12 + runs-on: macos-13 needs: check-version strategy: matrix: diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index ce7fb30b6..90fb03538 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -51,7 +51,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, windows-2022] + os: [macos-13, windows-2022] steps: - uses: actions/checkout@v3 - name: Cache dependencies diff --git a/Cargo.lock b/Cargo.lock index bcca35173..500f28454 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,8 +386,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arroy" -version = "0.4.0" -source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e" dependencies = [ "bytemuck", "byteorder", @@ -3414,6 +3415,7 @@ dependencies = [ "meilisearch-types", "mimalloc", "mime", + "mopa-maintained", "num_cpus", "obkv", "once_cell", @@ -3680,6 +3682,12 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "mopa-maintained" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11" + 
[[package]] name = "mutually_exclusive_features" version = "0.0.3" @@ -4581,9 +4589,8 @@ dependencies = [ [[package]] name = "rhai" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702" +version = "1.20.0" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4600,8 +4607,7 @@ dependencies = [ [[package]] name = "rhai_codegen" version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "proc-macro2", "quote", diff --git a/bors.toml b/bors.toml index 8750ed993..96e9ef65e 100644 --- a/bors.toml +++ b/bors.toml @@ -1,6 +1,6 @@ status = [ 'Tests on ubuntu-20.04', - 'Tests on macos-12', + 'Tests on macos-13', 'Tests on windows-2022', 'Run Clippy', 'Run Rustfmt', diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 432a86382..e80311005 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -40,7 +40,7 @@ ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } [dev-dependencies] -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +arroy = "0.5.0" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.39.0", features = ["json", "redactions"] } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index fe8244f9b..e0e2bfb75 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1263,7 +1263,7 @@ impl IndexScheduler { #[cfg(test)] 
self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?; - tracing::info!("Batch failed {}", error); + tracing::error!("Batch failed {}", error); self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index cb4937e57..0dae024f2 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -66,5 +66,8 @@ khmer = ["milli/khmer"] vietnamese = ["milli/vietnamese"] # force swedish character recomposition swedish-recomposition = ["milli/swedish-recomposition"] -# force german character recomposition +# allow german tokenization german = ["milli/german"] +# allow turkish normalization +turkish = ["milli/turkish"] + diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 2a16e1017..57202f59f 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -75,7 +75,7 @@ reqwest = { version = "0.12.5", features = [ rustls = { version = "0.23.11", features = ["ring"], default-features = false } rustls-pki-types = { version = "1.7.0", features = ["alloc"] } rustls-pemfile = "2.1.2" -segment = { version = "0.2.4", optional = true } +segment = { version = "0.2.4" } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } sha2 = "0.10.8" @@ -104,6 +104,7 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } roaring = "0.10.2" +mopa-maintained = "0.2.3" [dev-dependencies] actix-rt = "2.10.0" @@ -131,8 +132,7 @@ tempfile = { version = "3.10.1", optional = true } zip = { version = "2.1.3", optional = true } [features] -default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"] -analytics = ["segment"] +default = ["meilisearch-types/all-tokenizations", "mini-dashboard"] 
mini-dashboard = [ "static-files", "anyhow", @@ -154,7 +154,8 @@ khmer = ["meilisearch-types/khmer"] vietnamese = ["meilisearch-types/vietnamese"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"] german = ["meilisearch-types/german"] +turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" -sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip" +sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e" diff --git a/crates/meilisearch/src/analytics/mod.rs b/crates/meilisearch/src/analytics/mod.rs index 3c7ca0ed3..bd14b0bfa 100644 --- a/crates/meilisearch/src/analytics/mod.rs +++ b/crates/meilisearch/src/analytics/mod.rs @@ -1,44 +1,45 @@ -mod mock_analytics; -#[cfg(feature = "analytics")] -mod segment_analytics; +pub mod segment_analytics; use std::fs; use std::path::{Path, PathBuf}; use std::str::FromStr; +use std::sync::Arc; use actix_web::HttpRequest; +use index_scheduler::IndexScheduler; +use meilisearch_auth::AuthController; use meilisearch_types::InstanceUid; -pub use mock_analytics::MockAnalytics; +use mopa::mopafy; use once_cell::sync::Lazy; use platform_dirs::AppDirs; -use serde_json::Value; - -use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; - -// if the analytics feature is disabled -// the `SegmentAnalytics` point to the mock instead of the real analytics -#[cfg(not(feature = "analytics"))] -pub type SegmentAnalytics = mock_analytics::MockAnalytics; -#[cfg(not(feature = "analytics"))] -pub type SearchAggregator = mock_analytics::SearchAggregator; -#[cfg(not(feature = "analytics"))] -pub type SimilarAggregator = mock_analytics::SimilarAggregator; -#[cfg(not(feature = "analytics"))] -pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator; -#[cfg(not(feature = "analytics"))] -pub type 
FacetSearchAggregator = mock_analytics::FacetSearchAggregator; // if the feature analytics is enabled we use the real analytics -#[cfg(feature = "analytics")] pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; -#[cfg(feature = "analytics")] -pub type SearchAggregator = segment_analytics::SearchAggregator; -#[cfg(feature = "analytics")] -pub type SimilarAggregator = segment_analytics::SimilarAggregator; -#[cfg(feature = "analytics")] -pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator; -#[cfg(feature = "analytics")] -pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator; + +use crate::Opt; + +/// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name. +#[macro_export] +macro_rules! empty_analytics { + ($struct_name:ident, $event_name:literal) => { + #[derive(Default)] + struct $struct_name {} + + impl $crate::analytics::Aggregate for $struct_name { + fn event_name(&self) -> &'static str { + $event_name + } + + fn aggregate(self: Box, _other: Box) -> Box { + self + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::json!({}) + } + } + }; +} /// The Meilisearch config dir: /// `~/.config/Meilisearch` on *NIX or *BSD. @@ -78,60 +79,88 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } -pub trait Analytics: Sync + Send { - fn instance_uid(&self) -> Option<&InstanceUid>; +/// To send an event to segment, your event must be able to aggregate itself with another event of the same type. +pub trait Aggregate: 'static + mopa::Any + Send { + /// The name of the event that will be sent to segment. + fn event_name(&self) -> &'static str; + + /// Will be called every time an event has been used twice before segment flushed its buffer. + fn aggregate(self: Box, new: Box) -> Box + where + Self: Sized; + + /// Converts your structure to the final event that'll be sent to segment. 
+ fn into_event(self: Box) -> serde_json::Value; +} + +mopafy!(Aggregate); + +/// Helper trait to define multiple aggregates with the same content but a different name. +/// Commonly used when you must aggregate a search with POST or with GET, for example. +pub trait AggregateMethod: 'static + Default + Send { + fn event_name() -> &'static str; +} + +/// A macro used to quickly define multiple aggregate method with their name +/// Usage: +/// ```rust +/// use meilisearch::aggregate_methods; +/// +/// aggregate_methods!( +/// SearchGET => "Documents Searched GET", +/// SearchPOST => "Documents Searched POST", +/// ); +/// ``` +#[macro_export] +macro_rules! aggregate_methods { + ($method:ident => $event_name:literal) => { + #[derive(Default)] + pub struct $method {} + + impl $crate::analytics::AggregateMethod for $method { + fn event_name() -> &'static str { + $event_name + } + } + }; + ($($method:ident => $event_name:literal,)+) => { + $( + aggregate_methods!($method => $event_name); + )+ + + }; +} + +#[derive(Clone)] +pub struct Analytics { + segment: Option>, +} + +impl Analytics { + pub async fn new( + opt: &Opt, + index_scheduler: Arc, + auth_controller: Arc, + ) -> Self { + if opt.no_analytics { + Self { segment: None } + } else { + Self { segment: SegmentAnalytics::new(opt, index_scheduler, auth_controller).await } + } + } + + pub fn no_analytics() -> Self { + Self { segment: None } + } + + pub fn instance_uid(&self) -> Option<&InstanceUid> { + self.segment.as_ref().map(|segment| segment.instance_uid.as_ref()) + } /// The method used to publish most analytics that do not need to be batched every hours - fn publish(&self, event_name: String, send: Value, request: Option<&HttpRequest>); - - /// This method should be called to aggregate a get search - fn get_search(&self, aggregate: SearchAggregator); - - /// This method should be called to aggregate a post search - fn post_search(&self, aggregate: SearchAggregator); - - /// This method should be called to 
aggregate a get similar request - fn get_similar(&self, aggregate: SimilarAggregator); - - /// This method should be called to aggregate a post similar request - fn post_similar(&self, aggregate: SimilarAggregator); - - /// This method should be called to aggregate a post array of searches - fn post_multi_search(&self, aggregate: MultiSearchAggregator); - - /// This method should be called to aggregate post facet values searches - fn post_facet_search(&self, aggregate: FacetSearchAggregator); - - // this method should be called to aggregate an add documents request - fn add_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ); - - // this method should be called to aggregate a fetch documents request - fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest); - - // this method should be called to aggregate a fetch documents request - fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest); - - // this method should be called to aggregate a add documents request - fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest); - - // this method should be called to batch an update documents request - fn update_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ); - - // this method should be called to batch an update documents by function request - fn update_documents_by_function( - &self, - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ); + pub fn publish(&self, event: T, request: &HttpRequest) { + if let Some(ref segment) = self.segment { + let _ = segment.sender.try_send(segment_analytics::Message::new(event, request)); + } + } } diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index f8d6a0fdc..7dc746b14 100644 --- 
a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -1,19 +1,18 @@ -use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; +use std::any::TypeId; +use std::collections::{HashMap, HashSet}; use std::fs; -use std::mem::take; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{Duration, Instant}; -use actix_web::http::header::{CONTENT_TYPE, USER_AGENT}; +use actix_web::http::header::USER_AGENT; use actix_web::HttpRequest; use byte_unit::Byte; use index_scheduler::IndexScheduler; use meilisearch_auth::{AuthController, AuthFilter}; -use meilisearch_types::locales::Locale; +use meilisearch_types::features::RuntimeTogglableFeatures; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; -use regex::Regex; use segment::message::{Identify, Track, User}; use segment::{AutoBatcher, Batcher, HttpClient}; use serde::Serialize; @@ -24,22 +23,11 @@ use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use uuid::Uuid; -use super::{ - config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH, -}; -use crate::analytics::Analytics; +use super::{config_user_id_path, Aggregate, MEILISEARCH_CONFIG_PATH}; use crate::option::{ default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, }; -use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; -use crate::routes::indexes::facet_search::FacetSearchQuery; use crate::routes::{create_all_stats, Stats}; -use crate::search::{ - FacetSearchResult, FederatedSearch, MatchingStrategy, SearchQuery, SearchQueryWithIndex, - SearchResult, SimilarQuery, SimilarResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEMANTIC_RATIO, -}; use crate::Opt; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; @@ -57,7 +45,7 @@ fn write_user_id(db_path: &Path, user_id: &InstanceUid) 
{ const SEGMENT_API_KEY: &str = "P3FWhhEsJiEDCuEHpmcN9DHcK4hVfBvb"; -pub fn extract_user_agents(request: &HttpRequest) -> Vec { +pub fn extract_user_agents(request: &HttpRequest) -> HashSet { request .headers() .get(ANALYTICS_HEADER) @@ -70,26 +58,58 @@ pub fn extract_user_agents(request: &HttpRequest) -> Vec { .collect() } -pub enum AnalyticsMsg { - BatchMessage(Track), - AggregateGetSearch(SearchAggregator), - AggregatePostSearch(SearchAggregator), - AggregateGetSimilar(SimilarAggregator), - AggregatePostSimilar(SimilarAggregator), - AggregatePostMultiSearch(MultiSearchAggregator), - AggregatePostFacetSearch(FacetSearchAggregator), - AggregateAddDocuments(DocumentsAggregator), - AggregateDeleteDocuments(DocumentsDeletionAggregator), - AggregateUpdateDocuments(DocumentsAggregator), - AggregateEditDocumentsByFunction(EditDocumentsByFunctionAggregator), - AggregateGetFetchDocuments(DocumentsFetchAggregator), - AggregatePostFetchDocuments(DocumentsFetchAggregator), +pub struct Message { + // Since the type_id is solved statically we cannot retrieve it from the Box. + // Thus we have to send it in the message directly. + type_id: TypeId, + // Same for the aggregate function. + #[allow(clippy::type_complexity)] + aggregator_function: fn(Box, Box) -> Option>, + event: Event, +} + +pub struct Event { + original: Box, + timestamp: OffsetDateTime, + user_agents: HashSet, + total: usize, +} + +/// This function should always be called on the same type. If `this` and `other` +/// aren't the same type the function will do nothing and return `None`. 
+fn downcast_aggregate( + old: Box, + new: Box, +) -> Option> { + if old.is::() && new.is::() { + // Both the two following lines cannot fail, but just to be sure we don't crash, we're still avoiding unwrapping + let this = old.downcast::().ok()?; + let other = new.downcast::().ok()?; + Some(ConcreteType::aggregate(this, other)) + } else { + None + } +} + +impl Message { + pub fn new(event: T, request: &HttpRequest) -> Self { + Self { + type_id: TypeId::of::(), + event: Event { + original: Box::new(event), + timestamp: OffsetDateTime::now_utc(), + user_agents: extract_user_agents(request), + total: 1, + }, + aggregator_function: downcast_aggregate::, + } + } } pub struct SegmentAnalytics { - instance_uid: InstanceUid, - sender: Sender, - user: User, + pub instance_uid: InstanceUid, + pub user: User, + pub sender: Sender, } impl SegmentAnalytics { @@ -98,7 +118,7 @@ impl SegmentAnalytics { opt: &Opt, index_scheduler: Arc, auth_controller: Arc, - ) -> Arc { + ) -> Option> { let instance_uid = super::find_user_id(&opt.db_path); let first_time_run = instance_uid.is_none(); let instance_uid = instance_uid.unwrap_or_else(Uuid::new_v4); @@ -108,7 +128,7 @@ impl SegmentAnalytics { // if reqwest throws an error we won't be able to send analytics if client.is_err() { - return super::MockAnalytics::new(opt); + return None; } let client = @@ -144,113 +164,13 @@ impl SegmentAnalytics { user: user.clone(), opt: opt.clone(), batcher, - post_search_aggregator: SearchAggregator::default(), - post_multi_search_aggregator: MultiSearchAggregator::default(), - post_facet_search_aggregator: FacetSearchAggregator::default(), - get_search_aggregator: SearchAggregator::default(), - add_documents_aggregator: DocumentsAggregator::default(), - delete_documents_aggregator: DocumentsDeletionAggregator::default(), - update_documents_aggregator: DocumentsAggregator::default(), - edit_documents_by_function_aggregator: EditDocumentsByFunctionAggregator::default(), - get_fetch_documents_aggregator: 
DocumentsFetchAggregator::default(), - post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - get_similar_aggregator: SimilarAggregator::default(), - post_similar_aggregator: SimilarAggregator::default(), + events: HashMap::new(), }); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); let this = Self { instance_uid, sender, user: user.clone() }; - Arc::new(this) - } -} - -impl super::Analytics for SegmentAnalytics { - fn instance_uid(&self) -> Option<&InstanceUid> { - Some(&self.instance_uid) - } - - fn publish(&self, event_name: String, mut send: Value, request: Option<&HttpRequest>) { - let user_agent = request.map(extract_user_agents); - - send["user-agent"] = json!(user_agent); - let event = Track { - user: self.user.clone(), - event: event_name.clone(), - properties: send, - ..Default::default() - }; - let _ = self.sender.try_send(AnalyticsMsg::BatchMessage(event)); - } - - fn get_search(&self, aggregate: SearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetSearch(aggregate)); - } - - fn post_search(&self, aggregate: SearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSearch(aggregate)); - } - - fn get_similar(&self, aggregate: SimilarAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetSimilar(aggregate)); - } - - fn post_similar(&self, aggregate: SimilarAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSimilar(aggregate)); - } - - fn post_facet_search(&self, aggregate: FacetSearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFacetSearch(aggregate)); - } - - fn post_multi_search(&self, aggregate: MultiSearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostMultiSearch(aggregate)); - } - - fn add_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = DocumentsAggregator::from_query(documents_query, 
index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateAddDocuments(aggregate)); - } - - fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest) { - let aggregate = DocumentsDeletionAggregator::from_query(kind, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateDeleteDocuments(aggregate)); - } - - fn update_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = DocumentsAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateUpdateDocuments(aggregate)); - } - - fn update_documents_by_function( - &self, - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = - EditDocumentsByFunctionAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateEditDocumentsByFunction(aggregate)); - } - - fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) { - let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetFetchDocuments(aggregate)); - } - - fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) { - let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); + Some(Arc::new(this)) } } @@ -263,8 +183,12 @@ impl super::Analytics for SegmentAnalytics { struct Infos { env: String, experimental_contains_filter: bool, + experimental_vector_store: bool, experimental_enable_metrics: bool, + experimental_edit_documents_by_function: bool, experimental_search_queue_size: usize, + experimental_drop_search_after: usize, + experimental_nb_searches_per_core: usize, experimental_logs_mode: LogMode, 
experimental_replication_parameters: bool, experimental_enable_logs_route: bool, @@ -298,8 +222,8 @@ struct Infos { ssl_tickets: bool, } -impl From for Infos { - fn from(options: Opt) -> Self { +impl Infos { + pub fn new(options: Opt, features: RuntimeTogglableFeatures) -> Self { // We wants to decompose this whole struct by hand to be sure we don't forget // to add analytics when we add a field in the Opt. // Thus we must not insert `..` at the end. @@ -308,6 +232,8 @@ impl From for Infos { experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after, + experimental_nb_searches_per_core, experimental_logs_mode, experimental_replication_parameters, experimental_enable_logs_route, @@ -340,8 +266,7 @@ impl From for Infos { log_level, indexer_options, config_file_path, - #[cfg(feature = "analytics")] - no_analytics: _, + no_analytics: _, } = options; let schedule_snapshot = match schedule_snapshot { @@ -352,16 +277,28 @@ impl From for Infos { let IndexerOpts { max_indexing_memory, max_indexing_threads, skip_index_budget: _ } = indexer_options; + let RuntimeTogglableFeatures { + vector_store, + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + } = features; + // We're going to override every sensible information. // We consider information sensible if it contains a path, an address, or a key. 
Self { env, - experimental_contains_filter, - experimental_enable_metrics, + experimental_contains_filter: experimental_contains_filter | contains_filter, + experimental_vector_store: vector_store, + experimental_edit_documents_by_function: edit_documents_by_function, + experimental_enable_metrics: experimental_enable_metrics | metrics, experimental_search_queue_size, + experimental_drop_search_after: experimental_drop_search_after.into(), + experimental_nb_searches_per_core: experimental_nb_searches_per_core.into(), experimental_logs_mode, experimental_replication_parameters, - experimental_enable_logs_route, + experimental_enable_logs_route: experimental_enable_logs_route | logs_route, experimental_reduce_indexing_memory_usage, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), @@ -395,26 +332,15 @@ impl From for Infos { } pub struct Segment { - inbox: Receiver, + inbox: Receiver, user: User, opt: Opt, batcher: AutoBatcher, - get_search_aggregator: SearchAggregator, - post_search_aggregator: SearchAggregator, - post_multi_search_aggregator: MultiSearchAggregator, - post_facet_search_aggregator: FacetSearchAggregator, - add_documents_aggregator: DocumentsAggregator, - delete_documents_aggregator: DocumentsDeletionAggregator, - update_documents_aggregator: DocumentsAggregator, - edit_documents_by_function_aggregator: EditDocumentsByFunctionAggregator, - get_fetch_documents_aggregator: DocumentsFetchAggregator, - post_fetch_documents_aggregator: DocumentsFetchAggregator, - get_similar_aggregator: SimilarAggregator, - post_similar_aggregator: SimilarAggregator, + events: HashMap, } impl Segment { - fn compute_traits(opt: &Opt, stats: Stats) -> Value { + fn compute_traits(opt: &Opt, stats: Stats, features: RuntimeTogglableFeatures) -> Value { static FIRST_START_TIMESTAMP: Lazy = Lazy::new(Instant::now); static SYSTEM: Lazy = Lazy::new(|| { let disks = Disks::new_with_refreshed_list(); @@ -442,7 +368,7 @@ 
impl Segment { "indexes_number": stats.indexes.len(), "documents_number": number_of_documents, }, - "infos": Infos::from(opt.clone()), + "infos": Infos::new(opt.clone(), features), }) } @@ -461,36 +387,44 @@ impl Segment { _ = interval.tick() => { self.tick(index_scheduler.clone(), auth_controller.clone()).await; }, - msg = self.inbox.recv() => { - match msg { - Some(AnalyticsMsg::BatchMessage(msg)) => drop(self.batcher.push(msg).await), - Some(AnalyticsMsg::AggregateGetSearch(agreg)) => self.get_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostSearch(agreg)) => self.post_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostMultiSearch(agreg)) => self.post_multi_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostFacetSearch(agreg)) => self.post_facet_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateAddDocuments(agreg)) => self.add_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateDeleteDocuments(agreg)) => self.delete_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateEditDocumentsByFunction(agreg)) => self.edit_documents_by_function_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateGetSimilar(agreg)) => self.get_similar_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostSimilar(agreg)) => self.post_similar_aggregator.aggregate(agreg), - None => (), - } - } + Some(msg) = self.inbox.recv() => { + self.handle_msg(msg); + } } } } + fn handle_msg(&mut self, Message { type_id, aggregator_function, event }: Message) { + let new_event = match self.events.remove(&type_id) { + Some(old) => { + // The function 
should never fail since we retrieved the corresponding TypeId in the map. But in the unfortunate + // case it could happens we're going to silently ignore the error + let Some(original) = (aggregator_function)(old.original, event.original) else { + return; + }; + Event { + original, + // We always want to return the FIRST timestamp ever encountered + timestamp: old.timestamp, + user_agents: old.user_agents.union(&event.user_agents).cloned().collect(), + total: old.total.saturating_add(event.total), + } + } + None => event, + }; + self.events.insert(type_id, new_event); + } + async fn tick( &mut self, index_scheduler: Arc, auth_controller: Arc, ) { - if let Ok(stats) = - create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default()) - { + if let Ok(stats) = create_all_stats( + index_scheduler.clone().into(), + auth_controller.into(), + &AuthFilter::default(), + ) { // Replace the version number with the prototype name if any. let version = if let Some(prototype) = build_info::DescribeResult::from_build() .and_then(|describe| describe.as_prototype()) @@ -509,1486 +443,42 @@ impl Segment { }, })), user: self.user.clone(), - traits: Self::compute_traits(&self.opt, stats), + traits: Self::compute_traits( + &self.opt, + stats, + index_scheduler.features().runtime_features(), + ), ..Default::default() }) .await; } - let Segment { - inbox: _, - opt: _, - batcher: _, - user, - get_search_aggregator, - post_search_aggregator, - post_multi_search_aggregator, - post_facet_search_aggregator, - add_documents_aggregator, - delete_documents_aggregator, - update_documents_aggregator, - edit_documents_by_function_aggregator, - get_fetch_documents_aggregator, - post_fetch_documents_aggregator, - get_similar_aggregator, - post_similar_aggregator, - } = self; + // We empty the list of events + let events = std::mem::take(&mut self.events); - if let Some(get_search) = - take(get_search_aggregator).into_event(user, "Documents Searched GET") - { - let _ = 
self.batcher.push(get_search).await; - } - if let Some(post_search) = - take(post_search_aggregator).into_event(user, "Documents Searched POST") - { - let _ = self.batcher.push(post_search).await; - } - if let Some(post_multi_search) = take(post_multi_search_aggregator) - .into_event(user, "Documents Searched by Multi-Search POST") - { - let _ = self.batcher.push(post_multi_search).await; - } - if let Some(post_facet_search) = - take(post_facet_search_aggregator).into_event(user, "Facet Searched POST") - { - let _ = self.batcher.push(post_facet_search).await; - } - if let Some(add_documents) = - take(add_documents_aggregator).into_event(user, "Documents Added") - { - let _ = self.batcher.push(add_documents).await; - } - if let Some(delete_documents) = - take(delete_documents_aggregator).into_event(user, "Documents Deleted") - { - let _ = self.batcher.push(delete_documents).await; - } - if let Some(update_documents) = - take(update_documents_aggregator).into_event(user, "Documents Updated") - { - let _ = self.batcher.push(update_documents).await; - } - if let Some(edit_documents_by_function) = take(edit_documents_by_function_aggregator) - .into_event(user, "Documents Edited By Function") - { - let _ = self.batcher.push(edit_documents_by_function).await; - } - if let Some(get_fetch_documents) = - take(get_fetch_documents_aggregator).into_event(user, "Documents Fetched GET") - { - let _ = self.batcher.push(get_fetch_documents).await; - } - if let Some(post_fetch_documents) = - take(post_fetch_documents_aggregator).into_event(user, "Documents Fetched POST") - { - let _ = self.batcher.push(post_fetch_documents).await; + for (_, event) in events { + let Event { original, timestamp, user_agents, total } = event; + let name = original.event_name(); + let mut properties = original.into_event(); + if properties["user-agent"].is_null() { + properties["user-agent"] = json!(user_agents); + }; + if properties["requests"]["total_received"].is_null() { + 
properties["requests"]["total_received"] = total.into(); + }; + + let _ = self + .batcher + .push(Track { + user: self.user.clone(), + event: name.to_string(), + properties, + timestamp: Some(timestamp), + ..Default::default() + }) + .await; } - if let Some(get_similar_documents) = - take(get_similar_aggregator).into_event(user, "Similar GET") - { - let _ = self.batcher.push(get_similar_documents).await; - } - - if let Some(post_similar_documents) = - take(post_similar_aggregator).into_event(user, "Similar POST") - { - let _ = self.batcher.push(post_similar_documents).await; - } let _ = self.batcher.flush().await; } } - -#[derive(Default)] -pub struct SearchAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - total_degraded: usize, - total_used_negative_operator: usize, - time_spent: BinaryHeap, - - // sort - sort_with_geo_point: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - sort_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - sort_total_number_of_criteria: usize, - - // distinct - distinct: bool, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // attributes_to_search_on - // every time a search is done using attributes_to_search_on - attributes_to_search_on_total_number_of_uses: usize, - - // q - // The maximum number of terms in a q request - max_terms_number: usize, - - // vector - // The maximum number of floats in a vector request - max_vector_size: usize, - // Whether the semantic ratio passed to a hybrid search equals 
the default ratio. - semantic_ratio: bool, - hybrid: bool, - retrieve_vectors: bool, - - // every time a search is done, we increment the counter linked to the used settings - matching_strategy: HashMap, - - // List of the unique Locales passed as parameter - locales: BTreeSet, - - // pagination - max_limit: usize, - max_offset: usize, - finite_pagination: usize, - - // formatting - max_attributes_to_retrieve: usize, - max_attributes_to_highlight: usize, - highlight_pre_tag: bool, - highlight_post_tag: bool, - max_attributes_to_crop: usize, - crop_marker: bool, - show_matches_position: bool, - crop_length: bool, - - // facets - facets_sum_of_terms: usize, - facets_total_number_of_facets: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, -} - -impl SearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SearchQuery, request: &HttpRequest) -> Self { - let SearchQuery { - q, - vector, - offset, - limit, - page, - hits_per_page, - attributes_to_retrieve: _, - retrieve_vectors, - attributes_to_crop: _, - crop_length, - attributes_to_highlight: _, - show_matches_position, - show_ranking_score, - show_ranking_score_details, - filter, - sort, - distinct, - facets: _, - highlight_pre_tag, - highlight_post_tag, - crop_marker, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - - if let Some(ref sort) = sort { - ret.sort_total_number_of_criteria = 1; - ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); - ret.sort_sum_of_criteria_terms = sort.len(); - } - - ret.distinct = distinct.is_some(); - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - 
ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - // attributes_to_search_on - if attributes_to_search_on.is_some() { - ret.attributes_to_search_on_total_number_of_uses = 1; - } - - if let Some(ref q) = q { - ret.max_terms_number = q.split_whitespace().count(); - } - - if let Some(ref vector) = vector { - ret.max_vector_size = vector.len(); - } - ret.retrieve_vectors |= retrieve_vectors; - - if query.is_finite_pagination() { - let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); - ret.max_limit = limit; - ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; - ret.finite_pagination = 1; - } else { - ret.max_limit = *limit; - ret.max_offset = *offset; - ret.finite_pagination = 0; - } - - ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); - - if let Some(locales) = locales { - ret.locales = locales.iter().copied().collect(); - } - - ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); - ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); - ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); - ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); - ret.show_matches_position = *show_matches_position; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = 
ranking_score_threshold.is_some(); - - if let Some(hybrid) = hybrid { - ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); - ret.hybrid = true; - } - - ret - } - - pub fn succeed(&mut self, result: &SearchResult) { - let SearchResult { - hits: _, - query: _, - processing_time_ms, - hits_info: _, - semantic_hit_count: _, - facet_distribution: _, - facet_stats: _, - degraded, - used_negative_operator, - } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - if *degraded { - self.total_degraded = self.total_degraded.saturating_add(1); - } - if *used_negative_operator { - self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); - } - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [SearchAggregator] into another. - pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - ref mut locales, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // 
request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.total_degraded = self.total_degraded.saturating_add(total_degraded); - self.total_used_negative_operator = - self.total_used_negative_operator.saturating_add(total_used_negative_operator); - self.time_spent.append(time_spent); - - // sort - self.sort_with_geo_point |= sort_with_geo_point; - self.sort_sum_of_criteria_terms = - self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); - self.sort_total_number_of_criteria = - self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); - - // distinct - self.distinct |= distinct; - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - // attributes_to_search_on - self.attributes_to_search_on_total_number_of_uses = self - .attributes_to_search_on_total_number_of_uses - .saturating_add(attributes_to_search_on_total_number_of_uses); - - // q - self.max_terms_number = self.max_terms_number.max(max_terms_number); - - // vector - self.max_vector_size = self.max_vector_size.max(max_vector_size); - self.retrieve_vectors |= retrieve_vectors; - self.semantic_ratio |= semantic_ratio; - self.hybrid |= hybrid; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - self.finite_pagination += finite_pagination; - - // formatting - self.max_attributes_to_retrieve = - 
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - self.max_attributes_to_highlight = - self.max_attributes_to_highlight.max(max_attributes_to_highlight); - self.highlight_pre_tag |= highlight_pre_tag; - self.highlight_post_tag |= highlight_post_tag; - self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); - self.crop_marker |= crop_marker; - self.show_matches_position |= show_matches_position; - self.crop_length |= crop_length; - - // facets - self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); - self.facets_total_number_of_facets = - self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); - - // matching strategy - for (key, value) in matching_strategy.into_iter() { - let matching_strategy = self.matching_strategy.entry(key).or_insert(0); - *matching_strategy = matching_strategy.saturating_add(value); - } - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - - // locales - self.locales.append(locales); - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - 
show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - locales, - } = self; - - if total_received == 0 { - None - } else { - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - "total_degraded": total_degraded, - "total_used_negative_operator": total_used_negative_operator, - }, - "sort": { - "with_geoPoint": sort_with_geo_point, - "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), - }, - "distinct": distinct, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "attributes_to_search_on": { - "total_number_of_uses": attributes_to_search_on_total_number_of_uses, - }, - "q": { - "max_terms_number": max_terms_number, - }, - "vector": { - "max_vector_size": max_vector_size, - "retrieve_vectors": retrieve_vectors, - }, - "hybrid": { - "enabled": hybrid, - "semantic_ratio": semantic_ratio, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - "most_used_navigation": if finite_pagination > (total_received / 
2) { "exhaustive" } else { "estimated" }, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - "max_attributes_to_highlight": max_attributes_to_highlight, - "highlight_pre_tag": highlight_pre_tag, - "highlight_post_tag": highlight_post_tag, - "max_attributes_to_crop": max_attributes_to_crop, - "crop_marker": crop_marker, - "show_matches_position": show_matches_position, - "crop_length": crop_length, - }, - "facets": { - "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), - }, - "matching_strategy": { - "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "locales": locales, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct MultiSearchAggregator { - timestamp: Option, - - // requests - total_received: usize, - total_succeeded: usize, - - // sum of the number of distinct indexes in each single request, use with total_received to compute an avg - total_distinct_index_count: usize, - // number of queries with a single index, use with total_received to compute a proportion - total_single_index: usize, - - // sum of the number of search queries in the requests, use with total_received to compute an average - total_search_count: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - - // federation - use_federation: bool, - - // context - user_agents: HashSet, -} - -impl MultiSearchAggregator { - pub fn from_federated_search( - federated_search: &FederatedSearch, - request: &HttpRequest, - ) -> Self { - let timestamp = Some(OffsetDateTime::now_utc()); - - let user_agents = 
extract_user_agents(request).into_iter().collect(); - - let use_federation = federated_search.federation.is_some(); - - let distinct_indexes: HashSet<_> = federated_search - .queries - .iter() - .map(|query| { - let query = &query; - // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex - let SearchQueryWithIndex { - index_uid, - federation_options: _, - q: _, - vector: _, - offset: _, - limit: _, - page: _, - hits_per_page: _, - attributes_to_retrieve: _, - retrieve_vectors: _, - attributes_to_crop: _, - crop_length: _, - attributes_to_highlight: _, - show_ranking_score: _, - show_ranking_score_details: _, - show_matches_position: _, - filter: _, - sort: _, - distinct: _, - facets: _, - highlight_pre_tag: _, - highlight_post_tag: _, - crop_marker: _, - matching_strategy: _, - attributes_to_search_on: _, - hybrid: _, - ranking_score_threshold: _, - locales: _, - } = query; - - index_uid.as_str() - }) - .collect(); - - let show_ranking_score = - federated_search.queries.iter().any(|query| query.show_ranking_score); - let show_ranking_score_details = - federated_search.queries.iter().any(|query| query.show_ranking_score_details); - - Self { - timestamp, - total_received: 1, - total_succeeded: 0, - total_distinct_index_count: distinct_indexes.len(), - total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, - total_search_count: federated_search.queries.len(), - show_ranking_score, - show_ranking_score_details, - user_agents, - use_federation, - } - } - - pub fn succeed(&mut self) { - self.total_succeeded = self.total_succeeded.saturating_add(1); - } - - /// Aggregate one [MultiSearchAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - // write the aggregate in a way that will cause a compilation error if a field is added. - - // get ownership of self, replacing it by a default value. 
- let this = std::mem::take(self); - - let timestamp = this.timestamp.or(other.timestamp); - let total_received = this.total_received.saturating_add(other.total_received); - let total_succeeded = this.total_succeeded.saturating_add(other.total_succeeded); - let total_distinct_index_count = - this.total_distinct_index_count.saturating_add(other.total_distinct_index_count); - let total_single_index = this.total_single_index.saturating_add(other.total_single_index); - let total_search_count = this.total_search_count.saturating_add(other.total_search_count); - let show_ranking_score = this.show_ranking_score || other.show_ranking_score; - let show_ranking_score_details = - this.show_ranking_score_details || other.show_ranking_score_details; - let mut user_agents = this.user_agents; - let use_federation = this.use_federation || other.use_federation; - - for user_agent in other.user_agents.into_iter() { - user_agents.insert(user_agent); - } - - // need all fields or compile error - let mut aggregated = Self { - timestamp, - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - user_agents, - show_ranking_score, - show_ranking_score_details, - use_federation, - // do not add _ or ..Default::default() here - }; - - // replace the default self with the aggregated value - std::mem::swap(self, &mut aggregated); - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - user_agents, - show_ranking_score, - show_ranking_score_details, - use_federation, - } = self; - - if total_received == 0 { - None - } else { - let properties = json!({ - "user-agent": user_agents, - "requests": { - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "indexes": 
{ - "total_single_index": total_single_index, - "total_distinct_index_count": total_distinct_index_count, - "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early - }, - "searches": { - "total_search_count": total_search_count, - "avg_search_count": (total_search_count as f64) / (total_received as f64), - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - }, - "federation": { - "use_federation": use_federation, - } - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct FacetSearchAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // The set of all facetNames that were used - facet_names: HashSet, - - // As there been any other parameter than the facetName or facetQuery ones? 
- additional_search_parameters_provided: bool, -} - -impl FacetSearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &FacetSearchQuery, request: &HttpRequest) -> Self { - let FacetSearchQuery { - facet_query: _, - facet_name, - vector, - q, - filter, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - ret.facet_names = Some(facet_name.clone()).into_iter().collect(); - - ret.additional_search_parameters_provided = q.is_some() - || vector.is_some() - || filter.is_some() - || *matching_strategy != MatchingStrategy::default() - || attributes_to_search_on.is_some() - || hybrid.is_some() - || ranking_score_threshold.is_some() - || locales.is_some(); - - ret - } - - pub fn succeed(&mut self, result: &FacetSearchResult) { - let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result; - self.total_succeeded = self.total_succeeded.saturating_add(1); - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [FacetSearchAggregator] into another. 
- pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - facet_names, - additional_search_parameters_provided, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(time_spent); - - // facet_names - for facet_name in facet_names.into_iter() { - self.facet_names.insert(facet_name); - } - - // additional_search_parameters_provided - self.additional_search_parameters_provided |= additional_search_parameters_provided; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - time_spent, - facet_names, - additional_search_parameters_provided, - } = self; - - if total_received == 0 { - None - } else { - // the index of the 99th percentage of value - let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) 
+ 1.; - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th as usize); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "facets": { - "total_distinct_facet_count": facet_names.len(), - "additional_search_parameters_provided": additional_search_parameters_provided, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct DocumentsAggregator { - timestamp: Option, - - // set to true when at least one request was received - updated: bool, - - // context - user_agents: HashSet, - - content_types: HashSet, - primary_keys: HashSet, - index_creation: bool, -} - -impl DocumentsAggregator { - pub fn from_query( - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) -> Self { - let UpdateDocumentsQuery { primary_key, csv_delimiter: _ } = documents_query; - - let mut primary_keys = HashSet::new(); - if let Some(primary_key) = primary_key.clone() { - primary_keys.insert(primary_key); - } - - let mut content_types = HashSet::new(); - let content_type = request - .headers() - .get(CONTENT_TYPE) - .and_then(|s| s.to_str().ok()) - .unwrap_or("unknown") - .to_string(); - content_types.insert(content_type); - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - updated: true, - user_agents: extract_user_agents(request).into_iter().collect(), - content_types, - primary_keys, - index_creation, - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
- pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, primary_keys, content_types, index_creation, updated } = - other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - self.updated |= updated; - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - for primary_key in primary_keys { - self.primary_keys.insert(primary_key); - } - for content_type in content_types { - self.content_types.insert(content_type); - } - self.index_creation |= index_creation; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { timestamp, user_agents, primary_keys, content_types, index_creation, updated } = - self; - - if !updated { - None - } else { - let properties = json!({ - "user-agent": user_agents, - "payload_type": content_types, - "primary_key": primary_keys, - "index_creation": index_creation, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct EditDocumentsByFunctionAggregator { - timestamp: Option, - - // Set to true if at least one request was filtered - filtered: bool, - // Set to true if at least one request contained a context - with_context: bool, - - // context - user_agents: HashSet, - - index_creation: bool, -} - -impl EditDocumentsByFunctionAggregator { - pub fn from_query( - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ) -> Self { - let DocumentEditionByFunction { filter, context, function: _ } = documents_query; - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - filtered: filter.is_some(), - with_context: context.is_some(), - index_creation, - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
- pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, index_creation, filtered, with_context } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.index_creation |= index_creation; - self.filtered |= filtered; - self.with_context |= with_context; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { timestamp, user_agents, index_creation, filtered, with_context } = self; - - let properties = json!({ - "user-agent": user_agents, - "filtered": filtered, - "with_context": with_context, - "index_creation": index_creation, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } -} - -#[derive(Default, Serialize)] -pub struct DocumentsDeletionAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, - per_document_id: bool, - clear_all: bool, - per_batch: bool, - per_filter: bool, -} - -impl DocumentsDeletionAggregator { - pub fn from_query(kind: DocumentDeletionKind, request: &HttpRequest) -> Self { - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - per_document_id: matches!(kind, DocumentDeletionKind::PerDocumentId), - clear_all: matches!(kind, DocumentDeletionKind::ClearAll), - per_batch: matches!(kind, DocumentDeletionKind::PerBatch), - per_filter: matches!(kind, DocumentDeletionKind::PerFilter), - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
- pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - per_document_id, - clear_all, - per_batch, - per_filter, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.total_received = self.total_received.saturating_add(total_received); - self.per_document_id |= per_document_id; - self.clear_all |= clear_all; - self.per_batch |= per_batch; - self.per_filter |= per_filter; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. - let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - -#[derive(Default, Serialize)] -pub struct DocumentsFetchAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, - - // a call on ../documents/:doc_id - per_document_id: bool, - // if a filter was used - per_filter: bool, - - #[serde(rename = "vector.retrieve_vectors")] - retrieve_vectors: bool, - - // pagination - #[serde(rename = "pagination.max_limit")] - max_limit: usize, - #[serde(rename = "pagination.max_offset")] - max_offset: usize, -} - -impl DocumentsFetchAggregator { - pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset, retrieve_vectors) = match query { - DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), - DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. 
} => { - (*limit, *offset, *retrieve_vectors) - } - }; - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), - per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), - max_limit: limit, - max_offset: offset, - retrieve_vectors, - } - } - - /// Aggregate one [DocumentsFetchAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - per_document_id, - per_filter, - max_limit, - max_offset, - retrieve_vectors, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - - self.total_received = self.total_received.saturating_add(total_received); - self.per_document_id |= per_document_id; - self.per_filter |= per_filter; - - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - self.retrieve_vectors |= retrieve_vectors; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. 
- let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - -#[derive(Default)] -pub struct SimilarAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // Whether a non-default embedder was specified - retrieve_vectors: bool, - - // pagination - max_limit: usize, - max_offset: usize, - - // formatting - max_attributes_to_retrieve: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, -} - -impl SimilarAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SimilarQuery, request: &HttpRequest) -> Self { - let SimilarQuery { - id: _, - embedder: _, - offset, - limit, - attributes_to_retrieve: _, - retrieve_vectors, - show_ranking_score, - show_ranking_score_details, - filter, - ranking_score_threshold, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - 
"mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - ret.max_limit = *limit; - ret.max_offset = *offset; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - ret.retrieve_vectors = *retrieve_vectors; - - ret - } - - pub fn succeed(&mut self, result: &SimilarResult) { - let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [SimilarAggregator] into another. 
- pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(time_spent); - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - self.retrieve_vectors |= retrieve_vectors; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - 
total_received, - total_succeeded, - time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - } = self; - - if total_received == 0 { - None - } else { - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "vector": { - "retrieve_vectors": retrieve_vectors, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} diff --git a/crates/meilisearch/src/lib.rs 
b/crates/meilisearch/src/lib.rs index b24f18fae..633ad2776 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -120,7 +120,7 @@ pub fn create_app( search_queue: Data, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Data, enable_dashboard: bool, ) -> actix_web::App< impl ServiceFactory< @@ -473,14 +473,14 @@ pub fn configure_data( search_queue: Data, opt: &Opt, (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Data, ) { let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize; config .app_data(index_scheduler) .app_data(auth) .app_data(search_queue) - .app_data(web::Data::from(analytics)) + .app_data(analytics) .app_data(web::Data::new(logs_route)) .app_data(web::Data::new(logs_stderr)) .app_data(web::Data::new(opt.clone())) diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index b66bfc5b8..c0652bf1e 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::thread::available_parallelism; +use std::time::Duration; use actix_web::http::KeepAlive; use actix_web::web::Data; @@ -123,19 +124,12 @@ async fn try_main() -> anyhow::Result<()> { let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?; - #[cfg(all(not(debug_assertions), feature = "analytics"))] - let analytics = if !opt.no_analytics { - analytics::SegmentAnalytics::new(&opt, index_scheduler.clone(), auth_controller.clone()) - .await - } else { - analytics::MockAnalytics::new(&opt) - }; - #[cfg(any(debug_assertions, not(feature = "analytics")))] - let analytics = analytics::MockAnalytics::new(&opt); + let analytics = + analytics::Analytics::new(&opt, index_scheduler.clone(), auth_controller.clone()).await; print_launch_resume(&opt, analytics.clone(), config_read_from); - run_http(index_scheduler, auth_controller, opt, log_handle, 
analytics).await?; + run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; Ok(()) } @@ -145,16 +139,23 @@ async fn run_http( auth_controller: Arc, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Arc, ) -> anyhow::Result<()> { let enable_dashboard = &opt.env == "development"; let opt_clone = opt.clone(); let index_scheduler = Data::from(index_scheduler); let auth_controller = Data::from(auth_controller); + let analytics = Data::from(analytics); let search_queue = SearchQueue::new( opt.experimental_search_queue_size, - available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()), - ); + available_parallelism() + .unwrap_or(NonZeroUsize::new(2).unwrap()) + .checked_mul(opt.experimental_nb_searches_per_core) + .unwrap_or(NonZeroUsize::MAX), + ) + .with_time_to_abort(Duration::from_secs( + usize::from(opt.experimental_drop_search_after) as u64 + )); let search_queue = Data::new(search_queue); let http_server = HttpServer::new(move || { @@ -180,11 +181,7 @@ async fn run_http( Ok(()) } -pub fn print_launch_resume( - opt: &Opt, - analytics: Arc, - config_read_from: Option, -) { +pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Option) { let build_info = build_info::BuildInfo::from_build(); let protocol = @@ -226,7 +223,6 @@ pub fn print_launch_resume( eprintln!("Prototype:\t\t{:?}", prototype); } - #[cfg(all(not(debug_assertions), feature = "analytics"))] { if !opt.no_analytics { eprintln!( diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 3799bdcb7..7e87a5a2c 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -2,7 +2,7 @@ use std::env::VarError; use std::ffi::OsStr; use std::fmt::Display; use std::io::{BufReader, Read}; -use std::num::ParseIntError; +use std::num::{NonZeroUsize, ParseIntError}; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; @@ -29,7 +29,6 @@ const 
MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER"; -#[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH"; @@ -55,6 +54,8 @@ const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LO const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE"; +const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER"; +const MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE: &str = "MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE"; const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = @@ -208,7 +209,6 @@ pub struct Opt { /// Meilisearch automatically collects data from all instances that do not opt out using this flag. /// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted /// at any time. - #[cfg(feature = "analytics")] #[serde(default)] // we can't send true #[clap(long, env = MEILI_NO_ANALYTICS)] pub no_analytics: bool, @@ -357,10 +357,26 @@ pub struct Opt { /// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the /// queue is full it starts returning HTTP 503, Service Unavailable. /// The default value is 1000. 
- #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = 1000)] - #[serde(default)] + #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())] + #[serde(default = "default_experimental_search_queue_size")] pub experimental_search_queue_size: usize, + /// Experimental drop search after. For more information, see: + /// + /// Let you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it. + /// The default value is 60. + #[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())] + #[serde(default = "default_drop_search_after")] + pub experimental_drop_search_after: NonZeroUsize, + + /// Experimental number of searches per core. For more information, see: + /// + /// Lets you customize how many search requests can run on each core concurrently. + /// The default value is 4. + #[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())] + #[serde(default = "default_nb_searches_per_core")] + pub experimental_nb_searches_per_core: NonZeroUsize, + /// Experimental logs mode feature. For more information, see: /// /// Change the mode of the logs on the console. @@ -407,7 +423,6 @@ pub struct Opt { impl Opt { /// Whether analytics should be enabled or not. 
- #[cfg(all(not(debug_assertions), feature = "analytics"))] pub fn analytics(&self) -> bool { !self.no_analytics } @@ -487,11 +502,12 @@ impl Opt { ignore_missing_dump: _, ignore_dump_if_db_exists: _, config_file_path: _, - #[cfg(feature = "analytics")] no_analytics, experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after, + experimental_nb_searches_per_core, experimental_logs_mode, experimental_enable_logs_route, experimental_replication_parameters, @@ -513,10 +529,7 @@ impl Opt { ); } - #[cfg(feature = "analytics")] - { - export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); - } + export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); export_to_env_if_not_present( MEILI_HTTP_PAYLOAD_SIZE_LIMIT, http_payload_size_limit.to_string(), @@ -559,6 +572,14 @@ impl Opt { MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, experimental_search_queue_size.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, + experimental_drop_search_after.to_string(), + ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, + experimental_nb_searches_per_core.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_LOGS_MODE, experimental_logs_mode.to_string(), @@ -890,6 +911,18 @@ fn default_dump_dir() -> PathBuf { PathBuf::from(DEFAULT_DUMP_DIR) } +fn default_experimental_search_queue_size() -> usize { + 1000 +} + +fn default_drop_search_after() -> NonZeroUsize { + NonZeroUsize::new(60).unwrap() +} + +fn default_nb_searches_per_core() -> NonZeroUsize { + NonZeroUsize::new(4).unwrap() +} + /// Indicates if a snapshot was scheduled, and if yes with which interval. 
#[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)] pub enum ScheduleSnapshot { diff --git a/crates/meilisearch/src/routes/dump.rs b/crates/meilisearch/src/routes/dump.rs index 7f3cd06a5..c78dc4dad 100644 --- a/crates/meilisearch/src/routes/dump.rs +++ b/crates/meilisearch/src/routes/dump.rs @@ -4,7 +4,6 @@ use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; use crate::analytics::Analytics; @@ -18,14 +17,16 @@ pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); } +crate::empty_analytics!(DumpAnalytics, "Dump Created"); + pub async fn create_dump( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { - analytics.publish("Dump Created".to_string(), json!({}), Some(&req)); + analytics.publish(DumpAnalytics::default(), &req); let task = KindWithContent::DumpCreation { keys: auth_controller.list_keys()?, diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index bc656bdbb..5d93adc02 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -6,10 +6,10 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; -use serde_json::json; +use serde::Serialize; use tracing::debug; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; @@ -17,7 +17,7 @@ use crate::extractors::sequential_extractor::SeqHandler; pub fn 
configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") - .route(web::get().to(SeqHandler(get_features))) + .route(web::get().to(get_features)) .route(web::patch().to(SeqHandler(patch_features))), ); } @@ -27,12 +27,9 @@ async fn get_features( ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>, Data, >, - req: HttpRequest, - analytics: Data, ) -> HttpResponse { let features = index_scheduler.features(); - analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req)); let features = features.runtime_features(); debug!(returns = ?features, "Get features"); HttpResponse::Ok().json(features) @@ -53,6 +50,35 @@ pub struct RuntimeTogglableFeatures { pub contains_filter: Option, } +#[derive(Serialize)] +pub struct PatchExperimentalFeatureAnalytics { + vector_store: bool, + metrics: bool, + logs_route: bool, + edit_documents_by_function: bool, + contains_filter: bool, +} + +impl Aggregate for PatchExperimentalFeatureAnalytics { + fn event_name(&self) -> &'static str { + "Experimental features Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + vector_store: new.vector_store, + metrics: new.metrics, + logs_route: new.logs_route, + edit_documents_by_function: new.edit_documents_by_function, + contains_filter: new.contains_filter, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + async fn patch_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>, @@ -60,7 +86,7 @@ async fn patch_features( >, new_features: AwebJson, req: HttpRequest, - analytics: Data, + analytics: Data, ) -> Result { let features = index_scheduler.features(); debug!(parameters = ?new_features, "Patch features"); @@ -89,15 +115,14 @@ async fn patch_features( } = new_features; analytics.publish( - "Experimental features Updated".to_string(), - json!({ - "vector_store": vector_store, - "metrics": metrics, - "logs_route": 
logs_route, - "edit_documents_by_function": edit_documents_by_function, - "contains_filter": contains_filter, - }), - Some(&req), + PatchExperimentalFeatureAnalytics { + vector_store, + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + }, + &req, ); index_scheduler.put_runtime_features(new_features)?; debug!(returns = ?new_features, "Patch features"); diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 85cf33c54..47f73ef42 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use std::io::ErrorKind; +use std::marker::PhantomData; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -23,14 +25,14 @@ use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use serde_json::Value; use tempfile::tempfile; use tokio::fs::File; use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter}; use tracing::debug; -use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind}; +use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::error::MeilisearchHttpError; use crate::error::PayloadError::ReceivePayload; use crate::extractors::authentication::policies::*; @@ -41,7 +43,7 @@ use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; use crate::search::{parse_filter, RetrieveVectors}; -use crate::Opt; +use crate::{aggregate_methods, Opt}; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()] @@ -100,12 +102,84 @@ pub struct GetDocument { retrieve_vectors: Param, } +aggregate_methods!( + DocumentsGET => "Documents Fetched GET", + 
DocumentsPOST => "Documents Fetched POST", +); + +#[derive(Serialize)] +pub struct DocumentsFetchAggregator { + // a call on ../documents/:doc_id + per_document_id: bool, + // if a filter was used + per_filter: bool, + + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: bool, + + // pagination + #[serde(rename = "pagination.max_limit")] + max_limit: usize, + #[serde(rename = "pagination.max_offset")] + max_offset: usize, + + marker: std::marker::PhantomData, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum DocumentFetchKind { + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, +} + +impl DocumentsFetchAggregator { + pub fn from_query(query: &DocumentFetchKind) -> Self { + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { + (*limit, *offset, *retrieve_vectors) + } + }; + + Self { + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), + per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. 
} if *with_filter), + max_limit: limit, + max_offset: offset, + retrieve_vectors, + + marker: PhantomData, + } + } +} + +impl Aggregate for DocumentsFetchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + per_document_id: self.per_document_id | new.per_document_id, + per_filter: self.per_filter | new.per_filter, + retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, + max_limit: self.max_limit.max(new.max_limit), + max_offset: self.max_offset.max(new.max_offset), + marker: PhantomData, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn get_document( index_scheduler: GuardedData, Data>, document_param: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = document_param.into_inner(); debug!(parameters = ?params, "Get document"); @@ -117,8 +191,15 @@ pub async fn get_document( let features = index_scheduler.features(); let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; - analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, + analytics.publish( + DocumentsFetchAggregator:: { + retrieve_vectors: param_retrieve_vectors.0, + per_document_id: true, + per_filter: false, + max_limit: 0, + max_offset: 0, + marker: PhantomData, + }, &req, ); @@ -129,17 +210,52 @@ pub async fn get_document( Ok(HttpResponse::Ok().json(document)) } +#[derive(Serialize)] +pub struct DocumentsDeletionAggregator { + per_document_id: bool, + clear_all: bool, + per_batch: bool, + per_filter: bool, +} + +impl Aggregate for DocumentsDeletionAggregator { + fn event_name(&self) -> &'static str { + "Documents Deleted" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + per_document_id: 
self.per_document_id | new.per_document_id, + clear_all: self.clear_all | new.clear_all, + per_batch: self.per_batch | new.per_batch, + per_filter: self.per_filter | new.per_filter, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn delete_document( index_scheduler: GuardedData, Data>, path: web::Path, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = path.into_inner(); let index_uid = IndexUid::try_from(index_uid)?; - analytics.delete_documents(DocumentDeletionKind::PerDocumentId, &req); + analytics.publish( + DocumentsDeletionAggregator { + per_document_id: true, + clear_all: false, + per_batch: false, + per_filter: false, + }, + &req, + ); let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), @@ -190,17 +306,19 @@ pub async fn documents_by_query_post( index_uid: web::Path, body: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let body = body.into_inner(); debug!(parameters = ?body, "Get documents POST"); - analytics.post_fetch_documents( - &DocumentFetchKind::Normal { - with_filter: body.filter.is_some(), - limit: body.limit, - offset: body.offset, + analytics.publish( + DocumentsFetchAggregator:: { + per_filter: body.filter.is_some(), retrieve_vectors: body.retrieve_vectors, + max_limit: body.limit, + max_offset: body.offset, + per_document_id: false, + marker: PhantomData, }, &req, ); @@ -213,7 +331,7 @@ pub async fn get_documents( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Get documents GET"); @@ -235,12 +353,14 @@ pub async fn get_documents( filter, }; - analytics.get_fetch_documents( - &DocumentFetchKind::Normal { - with_filter: query.filter.is_some(), - limit: query.limit, - offset: query.offset, + 
analytics.publish( + DocumentsFetchAggregator:: { + per_filter: query.filter.is_some(), retrieve_vectors: query.retrieve_vectors, + max_limit: query.limit, + max_offset: query.offset, + per_document_id: false, + marker: PhantomData, }, &req, ); @@ -298,6 +418,39 @@ fn from_char_csv_delimiter( } } +aggregate_methods!( + Replaced => "Documents Added", + Updated => "Documents Updated", +); + +#[derive(Serialize)] +pub struct DocumentsAggregator { + payload_types: HashSet, + primary_key: HashSet, + index_creation: bool, + #[serde(skip)] + method: PhantomData, +} + +impl Aggregate for DocumentsAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + payload_types: self.payload_types.union(&new.payload_types).cloned().collect(), + primary_key: self.primary_key.union(&new.primary_key).cloned().collect(), + index_creation: self.index_creation | new.index_creation, + method: PhantomData, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(self).unwrap_or_default() + } +} + pub async fn replace_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -305,16 +458,32 @@ pub async fn replace_documents( body: Payload, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; debug!(parameters = ?params, "Replace documents"); let params = params.into_inner(); - analytics.add_documents( - ¶ms, - index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + let mut content_types = HashSet::new(); + let content_type = req + .headers() + .get(CONTENT_TYPE) + .and_then(|s| s.to_str().ok()) + .unwrap_or("unknown") + .to_string(); + content_types.insert(content_type); + let mut primary_keys = HashSet::new(); + if let Some(primary_key) = params.primary_key.clone() { + primary_keys.insert(primary_key); + } + analytics.publish( + DocumentsAggregator:: 
{ + payload_types: content_types, + primary_key: primary_keys, + index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + method: PhantomData, + }, &req, ); @@ -346,16 +515,32 @@ pub async fn update_documents( body: Payload, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let params = params.into_inner(); debug!(parameters = ?params, "Update documents"); - analytics.add_documents( - ¶ms, - index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + let mut content_types = HashSet::new(); + let content_type = req + .headers() + .get(CONTENT_TYPE) + .and_then(|s| s.to_str().ok()) + .unwrap_or("unknown") + .to_string(); + content_types.insert(content_type); + let mut primary_keys = HashSet::new(); + if let Some(primary_key) = params.primary_key.clone() { + primary_keys.insert(primary_key); + } + analytics.publish( + DocumentsAggregator:: { + payload_types: content_types, + primary_key: primary_keys, + index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + method: PhantomData, + }, &req, ); @@ -524,12 +709,20 @@ pub async fn delete_documents_batch( body: web::Json>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by batch"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; - analytics.delete_documents(DocumentDeletionKind::PerBatch, &req); + analytics.publish( + DocumentsDeletionAggregator { + per_batch: true, + per_document_id: false, + clear_all: false, + per_filter: false, + }, + &req, + ); let ids = body .iter() @@ -562,14 +755,22 @@ pub async fn delete_documents_by_filter( body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by filter"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; 
let index_uid = index_uid.into_inner(); let filter = body.into_inner().filter; - analytics.delete_documents(DocumentDeletionKind::PerFilter, &req); + analytics.publish( + DocumentsDeletionAggregator { + per_filter: true, + per_document_id: false, + clear_all: false, + per_batch: false, + }, + &req, + ); // we ensure the filter is well formed before enqueuing it crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())? @@ -599,13 +800,41 @@ pub struct DocumentEditionByFunction { pub function: String, } +#[derive(Serialize)] +struct EditDocumentsByFunctionAggregator { + // Set to true if at least one request was filtered + filtered: bool, + // Set to true if at least one request contained a context + with_context: bool, + + index_creation: bool, +} + +impl Aggregate for EditDocumentsByFunctionAggregator { + fn event_name(&self) -> &'static str { + "Documents Edited By Function" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + filtered: self.filtered | new.filtered, + with_context: self.with_context | new.with_context, + index_creation: self.index_creation | new.index_creation, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn edit_documents_by_function( index_scheduler: GuardedData, Data>, index_uid: web::Path, params: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Edit documents by function"); @@ -617,9 +846,12 @@ pub async fn edit_documents_by_function( let index_uid = index_uid.into_inner(); let params = params.into_inner(); - analytics.update_documents_by_function( - ¶ms, - index_scheduler.index(&index_uid).is_err(), + analytics.publish( + EditDocumentsByFunctionAggregator { + filtered: params.filter.is_some(), + with_context: params.context.is_some(), + index_creation: index_scheduler.index(&index_uid).is_err(), + }, &req, ); @@ 
-670,10 +902,18 @@ pub async fn clear_all_documents( index_uid: web::Path, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); + analytics.publish( + DocumentsDeletionAggregator { + clear_all: true, + per_document_id: false, + per_batch: false, + per_filter: false, + }, + &req, + ); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; let uid = get_task_id(&req, &opt)?; diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 1df80711d..99a4a4f28 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -1,3 +1,5 @@ +use std::collections::{BinaryHeap, HashSet}; + use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -10,14 +12,15 @@ use meilisearch_types::locales::Locale; use serde_json::Value; use tracing::debug; -use crate::analytics::{Analytics, FacetSearchAggregator}; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, - SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy, + RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEARCH_OFFSET, }; use crate::search_queue::SearchQueue; @@ -53,20 +56,122 @@ 
pub struct FacetSearchQuery { pub locales: Option>, } +#[derive(Default)] +pub struct FacetSearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // The set of all facetNames that were used + facet_names: HashSet, + + // As there been any other parameter than the facetName or facetQuery ones? + additional_search_parameters_provided: bool, +} + +impl FacetSearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &FacetSearchQuery) -> Self { + let FacetSearchQuery { + facet_query: _, + facet_name, + vector, + q, + filter, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + Self { + total_received: 1, + facet_names: Some(facet_name.clone()).into_iter().collect(), + additional_search_parameters_provided: q.is_some() + || vector.is_some() + || filter.is_some() + || *matching_strategy != MatchingStrategy::default() + || attributes_to_search_on.is_some() + || hybrid.is_some() + || ranking_score_threshold.is_some() + || locales.is_some(), + ..Default::default() + } + } + + pub fn succeed(&mut self, result: &FacetSearchResult) { + let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result; + self.total_succeeded = 1; + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for FacetSearchAggregator { + fn event_name(&self) -> &'static str { + "Facet Searched POST" + } + + fn aggregate(mut self: Box, new: Box) -> Box { + for time in new.time_spent { + self.time_spent.push(time); + } + + Box::new(Self { + total_received: self.total_received.saturating_add(new.total_received), + total_succeeded: self.total_succeeded.saturating_add(new.total_succeeded), + time_spent: self.time_spent, + facet_names: self.facet_names.union(&new.facet_names).cloned().collect(), + additional_search_parameters_provided: self.additional_search_parameters_provided + | 
new.additional_search_parameters_provided, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + facet_names, + additional_search_parameters_provided, + } = *self; + // the index of the 99th percentage of value + let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.; + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th as usize); + + serde_json::json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "facets": { + "total_distinct_facet_count": facet_names.len(), + "additional_search_parameters_provided": additional_search_parameters_provided, + }, + }) + } +} + pub async fn search( index_scheduler: GuardedData, Data>, search_queue: Data, index_uid: web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = params.into_inner(); debug!(parameters = ?query, "Facet search"); - let mut aggregate = FacetSearchAggregator::from_query(&query, &req); + let mut aggregate = FacetSearchAggregator::from_query(&query); let facet_query = query.facet_query.clone(); let facet_name = query.facet_name.clone(); @@ -100,7 +205,7 @@ pub async fn search( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.post_facet_search(aggregate); + analytics.publish(aggregate, &req); let search_result = search_result?; diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 35b747ccf..7d073ec5f 100644 --- 
a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeSet; use std::convert::Infallible; use actix_web::web::Data; @@ -13,12 +14,11 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::{self, FieldDistribution, Index}; use meilisearch_types::tasks::KindWithContent; use serde::Serialize; -use serde_json::json; use time::OffsetDateTime; use tracing::debug; use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; @@ -28,8 +28,11 @@ use crate::Opt; pub mod documents; pub mod facet_search; pub mod search; +mod search_analytics; pub mod settings; +mod settings_analytics; pub mod similar; +mod similar_analytics; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( @@ -123,12 +126,31 @@ pub struct IndexCreateRequest { primary_key: Option, } +#[derive(Serialize)] +struct IndexCreatedAggregate { + primary_key: BTreeSet, +} + +impl Aggregate for IndexCreatedAggregate { + fn event_name(&self) -> &'static str { + "Index Created" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn create_index( index_scheduler: GuardedData, Data>, body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Create index"); let IndexCreateRequest { primary_key, uid } = body.into_inner(); @@ -136,9 +158,8 @@ pub async fn create_index( let allow_index_creation = 
index_scheduler.filters().allow_index_creation(&uid); if allow_index_creation { analytics.publish( - "Index Created".to_string(), - json!({ "primary_key": primary_key }), - Some(&req), + IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() }, + &req, ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; @@ -194,21 +215,38 @@ pub async fn get_index( Ok(HttpResponse::Ok().json(index_view)) } +#[derive(Serialize)] +struct IndexUpdatedAggregate { + primary_key: BTreeSet, +} + +impl Aggregate for IndexUpdatedAggregate { + fn event_name(&self) -> &'static str { + "Index Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} pub async fn update_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Update index"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; let body = body.into_inner(); analytics.publish( - "Index Updated".to_string(), - json!({ "primary_key": body.primary_key }), - Some(&req), + IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() }, + &req, ); let task = KindWithContent::IndexUpdate { diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 6a8eee521..2f5cb4a36 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -13,12 +13,13 @@ use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; -use crate::analytics::{Analytics, SearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use 
crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; +use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST}; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, @@ -225,7 +226,7 @@ pub async fn search_with_url_query( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Search get"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -237,7 +238,7 @@ pub async fn search_with_url_query( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query); let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); @@ -254,7 +255,7 @@ pub async fn search_with_url_query( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.get_search(aggregate); + analytics.publish(aggregate, &req); let search_result = search_result?; @@ -268,7 +269,7 @@ pub async fn search_with_post( index_uid: web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -280,7 +281,7 @@ pub async fn search_with_post( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query); let index = index_scheduler.index(&index_uid)?; @@ -302,7 +303,7 @@ pub async fn search_with_post( MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); } } - 
analytics.post_search(aggregate); + analytics.publish(aggregate, &req); let search_result = search_result?; diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs new file mode 100644 index 000000000..8bbb1781f --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -0,0 +1,485 @@ +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; +use std::collections::{BTreeSet, BinaryHeap, HashMap}; + +use meilisearch_types::locales::Locale; + +use crate::{ + aggregate_methods, + analytics::{Aggregate, AggregateMethod}, + search::{ + SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEMANTIC_RATIO, + }, +}; + +aggregate_methods!( + SearchGET => "Documents Searched GET", + SearchPOST => "Documents Searched POST", +); + +#[derive(Default)] +pub struct SearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + total_degraded: usize, + total_used_negative_operator: usize, + time_spent: BinaryHeap, + + // sort + sort_with_geo_point: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + sort_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + sort_total_number_of_criteria: usize, + + // distinct + distinct: bool, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // attributes_to_search_on + // every time a search is done using attributes_to_search_on + 
attributes_to_search_on_total_number_of_uses: usize, + + // q + // The maximum number of terms in a q request + max_terms_number: usize, + + // vector + // The maximum number of floats in a vector request + max_vector_size: usize, + // Whether the semantic ratio passed to a hybrid search equals the default ratio. + semantic_ratio: bool, + hybrid: bool, + retrieve_vectors: bool, + + // every time a search is done, we increment the counter linked to the used settings + matching_strategy: HashMap, + + // List of the unique Locales passed as parameter + locales: BTreeSet, + + // pagination + max_limit: usize, + max_offset: usize, + finite_pagination: usize, + + // formatting + max_attributes_to_retrieve: usize, + max_attributes_to_highlight: usize, + highlight_pre_tag: bool, + highlight_post_tag: bool, + max_attributes_to_crop: usize, + crop_marker: bool, + show_matches_position: bool, + crop_length: bool, + + // facets + facets_sum_of_terms: usize, + facets_total_number_of_facets: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SearchQuery) -> Self { + let SearchQuery { + q, + vector, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve: _, + retrieve_vectors, + attributes_to_crop: _, + crop_length, + attributes_to_highlight: _, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + distinct, + facets: _, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref sort) = sort { + ret.sort_total_number_of_criteria = 1; + ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); + 
ret.sort_sum_of_criteria_terms = sort.len(); + } + + ret.distinct = distinct.is_some(); + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + // attributes_to_search_on + if attributes_to_search_on.is_some() { + ret.attributes_to_search_on_total_number_of_uses = 1; + } + + if let Some(ref q) = q { + ret.max_terms_number = q.split_whitespace().count(); + } + + if let Some(ref vector) = vector { + ret.max_vector_size = vector.len(); + } + ret.retrieve_vectors |= retrieve_vectors; + + if query.is_finite_pagination() { + let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); + ret.max_limit = limit; + ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; + ret.finite_pagination = 1; + } else { + ret.max_limit = *limit; + ret.max_offset = *offset; + ret.finite_pagination = 0; + } + + ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); + + if let Some(locales) = locales { + ret.locales = locales.iter().copied().collect(); + } + + ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); + ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); + ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); + ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); + 
ret.show_matches_position = *show_matches_position; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + if let Some(hybrid) = hybrid { + ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); + ret.hybrid = true; + } + + ret + } + + pub fn succeed(&mut self, result: &SearchResult) { + let SearchResult { + hits: _, + query: _, + processing_time_ms, + hits_info: _, + semantic_hit_count: _, + facet_distribution: _, + facet_stats: _, + degraded, + used_negative_operator, + } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + if *degraded { + self.total_degraded = self.total_degraded.saturating_add(1); + } + if *used_negative_operator { + self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); + } + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SearchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + 
ranking_score_threshold, + mut locales, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.total_degraded = self.total_degraded.saturating_add(total_degraded); + self.total_used_negative_operator = + self.total_used_negative_operator.saturating_add(total_used_negative_operator); + self.time_spent.append(&mut time_spent); + + // sort + self.sort_with_geo_point |= sort_with_geo_point; + self.sort_sum_of_criteria_terms = + self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); + self.sort_total_number_of_criteria = + self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); + + // distinct + self.distinct |= distinct; + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + // attributes_to_search_on + self.attributes_to_search_on_total_number_of_uses = self + .attributes_to_search_on_total_number_of_uses + .saturating_add(attributes_to_search_on_total_number_of_uses); + + // q + self.max_terms_number = self.max_terms_number.max(max_terms_number); + + // vector + self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; + self.semantic_ratio |= semantic_ratio; + self.hybrid |= hybrid; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + self.finite_pagination += 
finite_pagination; + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + self.max_attributes_to_highlight = + self.max_attributes_to_highlight.max(max_attributes_to_highlight); + self.highlight_pre_tag |= highlight_pre_tag; + self.highlight_post_tag |= highlight_post_tag; + self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); + self.crop_marker |= crop_marker; + self.show_matches_position |= show_matches_position; + self.crop_length |= crop_length; + + // facets + self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); + self.facets_total_number_of_facets = + self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); + + // matching strategy + for (key, value) in matching_strategy.into_iter() { + let matching_strategy = self.matching_strategy.entry(key).or_insert(0); + *matching_strategy = matching_strategy.saturating_add(value); + } + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + // locales + self.locales.append(&mut locales); + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + 
facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + locales, + marker: _, + } = *self; + + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + "total_degraded": total_degraded, + "total_used_negative_operator": total_used_negative_operator, + }, + "sort": { + "with_geoPoint": sort_with_geo_point, + "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), + }, + "distinct": distinct, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "attributes_to_search_on": { + "total_number_of_uses": attributes_to_search_on_total_number_of_uses, + }, + "q": { + "max_terms_number": max_terms_number, + }, + "vector": { + "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, + }, + "hybrid": { + "enabled": hybrid, + "semantic_ratio": semantic_ratio, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, + 
}, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + "max_attributes_to_highlight": max_attributes_to_highlight, + "highlight_pre_tag": highlight_pre_tag, + "highlight_post_tag": highlight_post_tag, + "max_attributes_to_crop": max_attributes_to_crop, + "crop_marker": crop_marker, + "show_matches_position": show_matches_position, + "crop_length": crop_length, + }, + "facets": { + "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), + }, + "matching_strategy": { + "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "locales": locales, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + }, + }) + } +} diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index aaf8673d0..bca763a99 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -1,15 +1,14 @@ +use super::settings_analytics::*; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; -use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::Setting; -use meilisearch_types::settings::{settings, RankingRuleView, SecretPolicy, Settings, Unchecked}; +use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; use crate::analytics::Analytics; @@ -20,7 +19,7 @@ use crate::Opt; #[macro_export] macro_rules! 
make_setting_route { - ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics_var:ident, $analytics:expr) => { + ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { pub mod $attr { use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse, Resource}; @@ -80,7 +79,7 @@ macro_rules! make_setting_route { body: deserr::actix_web::AwebJson, $err_ty>, req: HttpRequest, opt: web::Data, - $analytics_var: web::Data, + analytics: web::Data, ) -> std::result::Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -88,7 +87,10 @@ macro_rules! make_setting_route { debug!(parameters = ?body, "Update settings"); #[allow(clippy::redundant_closure_call)] - $analytics(&body, &req); + analytics.publish( + $crate::routes::indexes::settings_analytics::$analytics::new(body.as_ref()).into_settings(), + &req, + ); let new_settings = Settings { $attr: match body { @@ -160,21 +162,7 @@ make_setting_route!( >, filterable_attributes, "filterableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "FilterableAttributes Updated".to_string(), - json!({ - "filterable_attributes": { - "total": setting.as_ref().map(|filter| filter.len()).unwrap_or(0), - "has_geo": setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false), - } - }), - Some(req), - ); - } + FilterableAttributesAnalytics ); make_setting_route!( @@ -186,21 +174,7 @@ make_setting_route!( >, sortable_attributes, "sortableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "SortableAttributes Updated".to_string(), - json!({ - "sortable_attributes": { - "total": setting.as_ref().map(|sort| sort.len()), - "has_geo": setting.as_ref().map(|sort| sort.contains("_geo")), - }, - }), - Some(req), - ); - } + SortableAttributesAnalytics ); 
make_setting_route!( @@ -212,21 +186,7 @@ make_setting_route!( >, displayed_attributes, "displayedAttributes", - analytics, - |displayed: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "DisplayedAttributes Updated".to_string(), - json!({ - "displayed_attributes": { - "total": displayed.as_ref().map(|displayed| displayed.len()), - "with_wildcard": displayed.as_ref().map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - }, - }), - Some(req), - ); - } + DisplayedAttributesAnalytics ); make_setting_route!( @@ -238,40 +198,7 @@ make_setting_route!( >, typo_tolerance, "typoTolerance", - analytics, - |setting: &Option, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "TypoTolerance Updated".to_string(), - json!({ - "typo_tolerance": { - "enabled": setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), - "disable_on_attributes": setting - .as_ref() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - "disable_on_words": setting - .as_ref() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - "min_word_size_for_one_typo": setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.one_typo.set())) - .flatten(), - "min_word_size_for_two_typos": setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.two_typos.set())) - .flatten(), - }, - }), - Some(req), - ); - } + TypoToleranceAnalytics ); make_setting_route!( @@ -283,21 +210,7 @@ make_setting_route!( >, searchable_attributes, "searchableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "SearchableAttributes Updated".to_string(), - json!({ - "searchable_attributes": { - "total": setting.as_ref().map(|searchable| searchable.len()), - "with_wildcard": setting.as_ref().map(|searchable| searchable.iter().any(|searchable| searchable == "*")), - }, - 
}), - Some(req), - ); - } + SearchableAttributesAnalytics ); make_setting_route!( @@ -309,20 +222,7 @@ make_setting_route!( >, stop_words, "stopWords", - analytics, - |stop_words: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "StopWords Updated".to_string(), - json!({ - "stop_words": { - "total": stop_words.as_ref().map(|stop_words| stop_words.len()), - }, - }), - Some(req), - ); - } + StopWordsAnalytics ); make_setting_route!( @@ -334,20 +234,7 @@ make_setting_route!( >, non_separator_tokens, "nonSeparatorTokens", - analytics, - |non_separator_tokens: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "nonSeparatorTokens Updated".to_string(), - json!({ - "non_separator_tokens": { - "total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()), - }, - }), - Some(req), - ); - } + NonSeparatorTokensAnalytics ); make_setting_route!( @@ -359,20 +246,7 @@ make_setting_route!( >, separator_tokens, "separatorTokens", - analytics, - |separator_tokens: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "separatorTokens Updated".to_string(), - json!({ - "separator_tokens": { - "total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()), - }, - }), - Some(req), - ); - } + SeparatorTokensAnalytics ); make_setting_route!( @@ -384,20 +258,7 @@ make_setting_route!( >, dictionary, "dictionary", - analytics, - |dictionary: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "dictionary Updated".to_string(), - json!({ - "dictionary": { - "total": dictionary.as_ref().map(|dictionary| dictionary.len()), - }, - }), - Some(req), - ); - } + DictionaryAnalytics ); make_setting_route!( @@ -409,20 +270,7 @@ make_setting_route!( >, synonyms, "synonyms", - analytics, - |synonyms: &Option>>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "Synonyms Updated".to_string(), - json!({ - "synonyms": { - 
"total": synonyms.as_ref().map(|synonyms| synonyms.len()), - }, - }), - Some(req), - ); - } + SynonymsAnalytics ); make_setting_route!( @@ -434,19 +282,7 @@ make_setting_route!( >, distinct_attribute, "distinctAttribute", - analytics, - |distinct: &Option, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "DistinctAttribute Updated".to_string(), - json!({ - "distinct_attribute": { - "set": distinct.is_some(), - } - }), - Some(req), - ); - } + DistinctAttributeAnalytics ); make_setting_route!( @@ -458,20 +294,7 @@ make_setting_route!( >, proximity_precision, "proximityPrecision", - analytics, - |precision: &Option, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "ProximityPrecision Updated".to_string(), - json!({ - "proximity_precision": { - "set": precision.is_some(), - "value": precision.unwrap_or_default(), - } - }), - Some(req), - ); - } + ProximityPrecisionAnalytics ); make_setting_route!( @@ -483,17 +306,7 @@ make_setting_route!( >, localized_attributes, "localizedAttributes", - analytics, - |rules: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "LocalizedAttributesRules Updated".to_string(), - json!({ - "locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::>()) - }), - Some(req), - ); - } + LocalesAnalytics ); make_setting_route!( @@ -505,26 +318,7 @@ make_setting_route!( >, ranking_rules, "rankingRules", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "RankingRules Updated".to_string(), - json!({ - "ranking_rules": { - "words_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))), - "typo_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))), - "proximity_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, 
meilisearch_types::settings::RankingRuleView::Proximity))), - "attribute_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Attribute))), - "sort_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))), - "exactness_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Exactness))), - "values": setting.as_ref().map(|rr| rr.iter().filter(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Asc(_) | meilisearch_types::settings::RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::>().join(", ")), - } - }), - Some(req), - ); - } + RankingRulesAnalytics ); make_setting_route!( @@ -536,25 +330,7 @@ make_setting_route!( >, faceting, "faceting", - analytics, - |setting: &Option, req: &HttpRequest| { - use serde_json::json; - use meilisearch_types::facet_values_sort::FacetValuesSort; - - analytics.publish( - "Faceting Updated".to_string(), - json!({ - "faceting": { - "max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()), - "sort_facet_values_by_star_count": setting.as_ref().and_then(|s| { - s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - "sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - }, - }), - Some(req), - ); - } + FacetingAnalytics ); make_setting_route!( @@ -566,20 +342,7 @@ make_setting_route!( >, pagination, "pagination", - analytics, - |setting: &Option, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "Pagination Updated".to_string(), - json!({ - "pagination": { - "max_total_hits": setting.as_ref().and_then(|s| s.max_total_hits.set()), - }, - }), - Some(req), - ); - } + PaginationAnalytics ); make_setting_route!( @@ -591,75 +354,9 @@ make_setting_route!( >, 
embedders, "embedders", - analytics, - |setting: &Option>>, req: &HttpRequest| { - - - analytics.publish( - "Embedders Updated".to_string(), - serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}), - Some(req), - ); - } + EmbeddersAnalytics ); -fn embedder_analytics( - setting: Option< - &std::collections::BTreeMap< - String, - Setting, - >, - >, -) -> serde_json::Value { - let mut sources = std::collections::HashSet::new(); - - if let Some(s) = &setting { - for source in s - .values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.source.set()) - { - use meilisearch_types::milli::vector::settings::EmbedderSource; - match source { - EmbedderSource::OpenAi => sources.insert("openAi"), - EmbedderSource::HuggingFace => sources.insert("huggingFace"), - EmbedderSource::UserProvided => sources.insert("userProvided"), - EmbedderSource::Ollama => sources.insert("ollama"), - EmbedderSource::Rest => sources.insert("rest"), - }; - } - }; - - let document_template_used = setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.document_template.set().is_some()) - }); - - let document_template_max_bytes = setting.as_ref().and_then(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.document_template_max_bytes.set()) - .max() - }); - - let binary_quantization_used = setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.binary_quantized.set().is_some()) - }); - - json!( - { - "total": setting.as_ref().map(|s| s.len()), - "sources": sources, - "document_template_used": document_template_used, - "document_template_max_bytes": document_template_max_bytes, - "binary_quantization_used": binary_quantization_used, - } - ) -} - make_setting_route!( "/search-cutoff-ms", put, @@ -669,14 +366,7 @@ make_setting_route!( >, search_cutoff_ms, "searchCutoffMs", 
- analytics, - |setting: &Option, req: &HttpRequest| { - analytics.publish( - "Search Cutoff Updated".to_string(), - serde_json::json!({"search_cutoff_ms": setting }), - Some(req), - ); - } + SearchCutoffMsAnalytics ); macro_rules! generate_configure { @@ -720,7 +410,7 @@ pub async fn update_all( body: AwebJson, DeserrJsonError>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -729,104 +419,45 @@ pub async fn update_all( let new_settings = validate_settings(new_settings, &index_scheduler)?; analytics.publish( - "Settings Updated".to_string(), - json!({ - "ranking_rules": { - "words_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Words))), - "typo_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Typo))), - "proximity_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Proximity))), - "attribute_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Attribute))), - "sort_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Sort))), - "exactness_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Exactness))), - "values": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().filter(|s| !matches!(s, RankingRuleView::Asc(_) | RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::>().join(", ")), - }, - "searchable_attributes": { - "total": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.len()), - "with_wildcard": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.iter().any(|searchable| searchable == "*")), - }, - 
"displayed_attributes": { - "total": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.len()), - "with_wildcard": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - }, - "sortable_attributes": { - "total": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.len()), - "has_geo": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.iter().any(|s| s == "_geo")), - }, - "filterable_attributes": { - "total": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.len()), - "has_geo": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.iter().any(|s| s == "_geo")), - }, - "distinct_attribute": { - "set": new_settings.distinct_attribute.as_ref().set().is_some() - }, - "proximity_precision": { - "set": new_settings.proximity_precision.as_ref().set().is_some(), - "value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default() - }, - "typo_tolerance": { - "enabled": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.enabled.as_ref().set()) - .copied(), - "disable_on_attributes": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - "disable_on_words": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - "min_word_size_for_one_typo": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.one_typo.set())) - .flatten(), - "min_word_size_for_two_typos": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.two_typos.set())) - .flatten(), - }, - "faceting": { - "max_values_per_facet": new_settings.faceting - .as_ref() - .set() - .and_then(|s| s.max_values_per_facet.as_ref().set()), - 
"sort_facet_values_by_star_count": new_settings.faceting - .as_ref() - .set() - .and_then(|s| { - s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - "sort_facet_values_by_total": new_settings.faceting - .as_ref() - .set() - .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - }, - "pagination": { - "max_total_hits": new_settings.pagination - .as_ref() - .set() - .and_then(|s| s.max_total_hits.as_ref().set()), - }, - "stop_words": { - "total": new_settings.stop_words.as_ref().set().map(|stop_words| stop_words.len()), - }, - "synonyms": { - "total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()), - }, - "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()), - "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(), - "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::>()), - }), - Some(&req), + SettingsAnalytics { + ranking_rules: RankingRulesAnalytics::new(new_settings.ranking_rules.as_ref().set()), + searchable_attributes: SearchableAttributesAnalytics::new( + new_settings.searchable_attributes.as_ref().set(), + ), + displayed_attributes: DisplayedAttributesAnalytics::new( + new_settings.displayed_attributes.as_ref().set(), + ), + sortable_attributes: SortableAttributesAnalytics::new( + new_settings.sortable_attributes.as_ref().set(), + ), + filterable_attributes: FilterableAttributesAnalytics::new( + new_settings.filterable_attributes.as_ref().set(), + ), + distinct_attribute: DistinctAttributeAnalytics::new( + new_settings.distinct_attribute.as_ref().set(), + ), + proximity_precision: ProximityPrecisionAnalytics::new( + new_settings.proximity_precision.as_ref().set(), + ), + typo_tolerance: TypoToleranceAnalytics::new(new_settings.typo_tolerance.as_ref().set()), + faceting: 
FacetingAnalytics::new(new_settings.faceting.as_ref().set()), + pagination: PaginationAnalytics::new(new_settings.pagination.as_ref().set()), + stop_words: StopWordsAnalytics::new(new_settings.stop_words.as_ref().set()), + synonyms: SynonymsAnalytics::new(new_settings.synonyms.as_ref().set()), + embedders: EmbeddersAnalytics::new(new_settings.embedders.as_ref().set()), + search_cutoff_ms: SearchCutoffMsAnalytics::new( + new_settings.search_cutoff_ms.as_ref().set(), + ), + locales: LocalesAnalytics::new(new_settings.localized_attributes.as_ref().set()), + dictionary: DictionaryAnalytics::new(new_settings.dictionary.as_ref().set()), + separator_tokens: SeparatorTokensAnalytics::new( + new_settings.separator_tokens.as_ref().set(), + ), + non_separator_tokens: NonSeparatorTokensAnalytics::new( + new_settings.non_separator_tokens.as_ref().set(), + ), + }, + &req, ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs new file mode 100644 index 000000000..de01b72e8 --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -0,0 +1,621 @@ +//! All the structures used to make the analytics on the settings work. +//! The signatures of the `new` functions are not very idiomatic Rust because they must match the types received +//! through the sub-settings routes directly, without any manipulation. +//! This is why we often use an `Option<&Vec<_>>` instead of an `Option<&[_]>`. 
+ +use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; +use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::settings::{ + FacetingSettings, PaginationSettings, ProximityPrecisionView, TypoSettings, +}; +use meilisearch_types::{facet_values_sort::FacetValuesSort, settings::RankingRuleView}; +use serde::Serialize; +use std::collections::{BTreeMap, BTreeSet, HashSet}; + +use crate::analytics::Aggregate; + +#[derive(Serialize, Default)] +pub struct SettingsAnalytics { + pub ranking_rules: RankingRulesAnalytics, + pub searchable_attributes: SearchableAttributesAnalytics, + pub displayed_attributes: DisplayedAttributesAnalytics, + pub sortable_attributes: SortableAttributesAnalytics, + pub filterable_attributes: FilterableAttributesAnalytics, + pub distinct_attribute: DistinctAttributeAnalytics, + pub proximity_precision: ProximityPrecisionAnalytics, + pub typo_tolerance: TypoToleranceAnalytics, + pub faceting: FacetingAnalytics, + pub pagination: PaginationAnalytics, + pub stop_words: StopWordsAnalytics, + pub synonyms: SynonymsAnalytics, + pub embedders: EmbeddersAnalytics, + pub search_cutoff_ms: SearchCutoffMsAnalytics, + pub locales: LocalesAnalytics, + pub dictionary: DictionaryAnalytics, + pub separator_tokens: SeparatorTokensAnalytics, + pub non_separator_tokens: NonSeparatorTokensAnalytics, +} + +impl Aggregate for SettingsAnalytics { + fn event_name(&self) -> &'static str { + "Settings Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + ranking_rules: RankingRulesAnalytics { + words_position: new + .ranking_rules + .words_position + .or(self.ranking_rules.words_position), + typo_position: new.ranking_rules.typo_position.or(self.ranking_rules.typo_position), + proximity_position: new + .ranking_rules + .proximity_position + .or(self.ranking_rules.proximity_position), + attribute_position: new + .ranking_rules + .attribute_position 
+ .or(self.ranking_rules.attribute_position), + sort_position: new.ranking_rules.sort_position.or(self.ranking_rules.sort_position), + exactness_position: new + .ranking_rules + .exactness_position + .or(self.ranking_rules.exactness_position), + values: new.ranking_rules.values.or(self.ranking_rules.values), + }, + searchable_attributes: SearchableAttributesAnalytics { + total: new.searchable_attributes.total.or(self.searchable_attributes.total), + with_wildcard: new + .searchable_attributes + .with_wildcard + .or(self.searchable_attributes.with_wildcard), + }, + displayed_attributes: DisplayedAttributesAnalytics { + total: new.displayed_attributes.total.or(self.displayed_attributes.total), + with_wildcard: new + .displayed_attributes + .with_wildcard + .or(self.displayed_attributes.with_wildcard), + }, + sortable_attributes: SortableAttributesAnalytics { + total: new.sortable_attributes.total.or(self.sortable_attributes.total), + has_geo: new.sortable_attributes.has_geo.or(self.sortable_attributes.has_geo), + }, + filterable_attributes: FilterableAttributesAnalytics { + total: new.filterable_attributes.total.or(self.filterable_attributes.total), + has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo), + }, + distinct_attribute: DistinctAttributeAnalytics { + set: self.distinct_attribute.set | new.distinct_attribute.set, + }, + proximity_precision: ProximityPrecisionAnalytics { + set: self.proximity_precision.set | new.proximity_precision.set, + value: new.proximity_precision.value.or(self.proximity_precision.value), + }, + typo_tolerance: TypoToleranceAnalytics { + enabled: new.typo_tolerance.enabled.or(self.typo_tolerance.enabled), + disable_on_attributes: new + .typo_tolerance + .disable_on_attributes + .or(self.typo_tolerance.disable_on_attributes), + disable_on_words: new + .typo_tolerance + .disable_on_words + .or(self.typo_tolerance.disable_on_words), + min_word_size_for_one_typo: new + .typo_tolerance + 
.min_word_size_for_one_typo + .or(self.typo_tolerance.min_word_size_for_one_typo), + min_word_size_for_two_typos: new + .typo_tolerance + .min_word_size_for_two_typos + .or(self.typo_tolerance.min_word_size_for_two_typos), + }, + faceting: FacetingAnalytics { + max_values_per_facet: new + .faceting + .max_values_per_facet + .or(self.faceting.max_values_per_facet), + sort_facet_values_by_star_count: new + .faceting + .sort_facet_values_by_star_count + .or(self.faceting.sort_facet_values_by_star_count), + sort_facet_values_by_total: new + .faceting + .sort_facet_values_by_total + .or(self.faceting.sort_facet_values_by_total), + }, + pagination: PaginationAnalytics { + max_total_hits: new.pagination.max_total_hits.or(self.pagination.max_total_hits), + }, + stop_words: StopWordsAnalytics { + total: new.stop_words.total.or(self.stop_words.total), + }, + synonyms: SynonymsAnalytics { total: new.synonyms.total.or(self.synonyms.total) }, + embedders: EmbeddersAnalytics { + total: new.embedders.total.or(self.embedders.total), + sources: match (self.embedders.sources, new.embedders.sources) { + (None, None) => None, + (Some(sources), None) | (None, Some(sources)) => Some(sources), + (Some(this), Some(other)) => Some(this.union(&other).cloned().collect()), + }, + document_template_used: match ( + self.embedders.document_template_used, + new.embedders.document_template_used, + ) { + (None, None) => None, + (Some(used), None) | (None, Some(used)) => Some(used), + (Some(this), Some(other)) => Some(this | other), + }, + document_template_max_bytes: match ( + self.embedders.document_template_max_bytes, + new.embedders.document_template_max_bytes, + ) { + (None, None) => None, + (Some(bytes), None) | (None, Some(bytes)) => Some(bytes), + (Some(this), Some(other)) => Some(this.max(other)), + }, + binary_quantization_used: match ( + self.embedders.binary_quantization_used, + new.embedders.binary_quantization_used, + ) { + (None, None) => None, + (Some(bq), None) | (None, Some(bq)) => 
Some(bq), + (Some(this), Some(other)) => Some(this | other), + }, + }, + search_cutoff_ms: SearchCutoffMsAnalytics { + search_cutoff_ms: new + .search_cutoff_ms + .search_cutoff_ms + .or(self.search_cutoff_ms.search_cutoff_ms), + }, + locales: LocalesAnalytics { locales: new.locales.locales.or(self.locales.locales) }, + dictionary: DictionaryAnalytics { + total: new.dictionary.total.or(self.dictionary.total), + }, + separator_tokens: SeparatorTokensAnalytics { + total: new.separator_tokens.total.or(self.separator_tokens.total), // newest value wins; fall back to the aggregated one + }, + non_separator_tokens: NonSeparatorTokensAnalytics { + total: new.non_separator_tokens.total.or(self.non_separator_tokens.total), + }, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + +#[derive(Serialize, Default)] +pub struct RankingRulesAnalytics { + pub words_position: Option, + pub typo_position: Option, + pub proximity_position: Option, + pub attribute_position: Option, + pub sort_position: Option, + pub exactness_position: Option, + pub values: Option, +} + +impl RankingRulesAnalytics { + pub fn new(rr: Option<&Vec>) -> Self { + RankingRulesAnalytics { + words_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) + }), + typo_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) + }), + proximity_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) + }) + }), + attribute_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) + }) + }), + sort_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) + }), + exactness_position: rr.as_ref().and_then(|rr| { + 
rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) + }) + }), + values: rr.as_ref().map(|rr| { + rr.iter() + .filter(|s| { + matches!( + s, + meilisearch_types::settings::RankingRuleView::Asc(_) + | meilisearch_types::settings::RankingRuleView::Desc(_) + ) + }) + .map(|x| x.to_string()) + .collect::>() + .join(", ") + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { ranking_rules: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SearchableAttributesAnalytics { + pub total: Option, + pub with_wildcard: Option, +} + +impl SearchableAttributesAnalytics { + pub fn new(setting: Option<&Vec>) -> Self { + Self { + total: setting.as_ref().map(|searchable| searchable.len()), + with_wildcard: setting + .as_ref() + .map(|searchable| searchable.iter().any(|searchable| searchable == "*")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { searchable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DisplayedAttributesAnalytics { + pub total: Option, + pub with_wildcard: Option, +} + +impl DisplayedAttributesAnalytics { + pub fn new(displayed: Option<&Vec>) -> Self { + Self { + total: displayed.as_ref().map(|displayed| displayed.len()), + with_wildcard: displayed + .as_ref() + .map(|displayed| displayed.iter().any(|displayed| displayed == "*")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { displayed_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SortableAttributesAnalytics { + pub total: Option, + pub has_geo: Option, +} + +impl SortableAttributesAnalytics { + pub fn new(setting: Option<&BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|sort| sort.len()), + has_geo: setting.as_ref().map(|sort| sort.contains("_geo")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + 
SettingsAnalytics { sortable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct FilterableAttributesAnalytics { + pub total: Option, + pub has_geo: Option, +} + +impl FilterableAttributesAnalytics { + pub fn new(setting: Option<&BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|filter| filter.len()), + has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { filterable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DistinctAttributeAnalytics { + pub set: bool, +} + +impl DistinctAttributeAnalytics { + pub fn new(distinct: Option<&String>) -> Self { + Self { set: distinct.is_some() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { distinct_attribute: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct ProximityPrecisionAnalytics { + pub set: bool, + pub value: Option, +} + +impl ProximityPrecisionAnalytics { + pub fn new(precision: Option<&ProximityPrecisionView>) -> Self { + Self { set: precision.is_some(), value: precision.cloned() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { proximity_precision: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct TypoToleranceAnalytics { + pub enabled: Option, + pub disable_on_attributes: Option, + pub disable_on_words: Option, + pub min_word_size_for_one_typo: Option, + pub min_word_size_for_two_typos: Option, +} + +impl TypoToleranceAnalytics { + pub fn new(setting: Option<&TypoSettings>) -> Self { + Self { + enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), + disable_on_attributes: setting + .as_ref() + .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), + disable_on_words: setting + .as_ref() + .and_then(|s| s.disable_on_words.as_ref().set().map(|m| 
!m.is_empty())), + min_word_size_for_one_typo: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set())) + .flatten(), + min_word_size_for_two_typos: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set())) + .flatten(), + } + } + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { typo_tolerance: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct FacetingAnalytics { + pub max_values_per_facet: Option, + pub sort_facet_values_by_star_count: Option, + pub sort_facet_values_by_total: Option, +} + +impl FacetingAnalytics { + pub fn new(setting: Option<&FacetingSettings>) -> Self { + Self { + max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()), + sort_facet_values_by_star_count: setting.as_ref().and_then(|s| { + s.sort_facet_values_by + .as_ref() + .set() + .map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) + }), + sort_facet_values_by_total: setting + .as_ref() + .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { faceting: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct PaginationAnalytics { + pub max_total_hits: Option, +} + +impl PaginationAnalytics { + pub fn new(setting: Option<&PaginationSettings>) -> Self { + Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { pagination: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct StopWordsAnalytics { + pub total: Option, +} + +impl StopWordsAnalytics { + pub fn new(stop_words: Option<&BTreeSet>) -> Self { + Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { 
stop_words: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SynonymsAnalytics { + pub total: Option, +} + +impl SynonymsAnalytics { + pub fn new(synonyms: Option<&BTreeMap>>) -> Self { + Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { synonyms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct EmbeddersAnalytics { + // last + pub total: Option, + // Merge the sources + pub sources: Option>, + // |= + pub document_template_used: Option, + // max + pub document_template_max_bytes: Option, + // |= + pub binary_quantization_used: Option, +} + +impl EmbeddersAnalytics { + pub fn new(setting: Option<&BTreeMap>>) -> Self { + let mut sources = std::collections::HashSet::new(); + + if let Some(s) = &setting { + for source in s + .values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.source.set()) + { + use meilisearch_types::milli::vector::settings::EmbedderSource; + match source { + EmbedderSource::OpenAi => sources.insert("openAi".to_string()), + EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()), + EmbedderSource::UserProvided => sources.insert("userProvided".to_string()), + EmbedderSource::Ollama => sources.insert("ollama".to_string()), + EmbedderSource::Rest => sources.insert("rest".to_string()), + }; + } + }; + + Self { + total: setting.as_ref().map(|s| s.len()), + sources: Some(sources), + document_template_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.document_template.set().is_some()) + }), + document_template_max_bytes: setting.as_ref().and_then(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.document_template_max_bytes.set()) + .max() + }), + binary_quantization_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| 
config.clone().set()) + .any(|config| config.binary_quantized.set().is_some()) + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { embedders: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +pub struct SearchCutoffMsAnalytics { + pub search_cutoff_ms: Option, +} + +impl SearchCutoffMsAnalytics { + pub fn new(setting: Option<&u64>) -> Self { + Self { search_cutoff_ms: setting.copied() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { search_cutoff_ms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +pub struct LocalesAnalytics { + pub locales: Option>, +} + +impl LocalesAnalytics { + pub fn new(rules: Option<&Vec>) -> Self { + LocalesAnalytics { + locales: rules.as_ref().map(|rules| { + rules + .iter() + .flat_map(|rule| rule.locales.iter().cloned()) + .collect::>() + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { locales: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DictionaryAnalytics { + pub total: Option, +} + +impl DictionaryAnalytics { + pub fn new(dictionary: Option<&BTreeSet>) -> Self { + Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { dictionary: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SeparatorTokensAnalytics { + pub total: Option, +} + +impl SeparatorTokensAnalytics { + pub fn new(separator_tokens: Option<&BTreeSet>) -> Self { + Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { separator_tokens: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct NonSeparatorTokensAnalytics { + pub total: Option, +} + +impl NonSeparatorTokensAnalytics { + 
pub fn new(non_separator_tokens: Option<&BTreeSet>) -> Self { + Self { + total: non_separator_tokens + .as_ref() + .map(|non_separator_tokens| non_separator_tokens.len()), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { non_separator_tokens: self, ..Default::default() } + } +} diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index f94a02987..79f42f0aa 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -13,9 +13,10 @@ use serde_json::Value; use tracing::debug; use super::ActionPolicy; -use crate::analytics::{Analytics, SimilarAggregator}; +use crate::analytics::Analytics; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST}; use crate::search::{ add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, @@ -34,13 +35,13 @@ pub async fn similar_get( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = params.0.try_into()?; - let mut aggregate = SimilarAggregator::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query); debug!(parameters = ?query, "Similar get"); @@ -49,7 +50,7 @@ pub async fn similar_get( if let Ok(similar) = &similar { aggregate.succeed(similar); } - analytics.get_similar(aggregate); + analytics.publish(aggregate, &req); let similar = similar?; @@ -62,21 +63,21 @@ pub async fn similar_post( index_uid: web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = 
IndexUid::try_from(index_uid.into_inner())?; let query = params.into_inner(); debug!(parameters = ?query, "Similar post"); - let mut aggregate = SimilarAggregator::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query); let similar = similar(index_scheduler, index_uid, query).await; if let Ok(similar) = &similar { aggregate.succeed(similar); } - analytics.post_similar(aggregate); + analytics.publish(aggregate, &req); let similar = similar?; diff --git a/crates/meilisearch/src/routes/indexes/similar_analytics.rs b/crates/meilisearch/src/routes/indexes/similar_analytics.rs new file mode 100644 index 000000000..69685a56c --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/similar_analytics.rs @@ -0,0 +1,235 @@ +use std::collections::{BinaryHeap, HashMap}; + +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; + +use crate::{ + aggregate_methods, + analytics::{Aggregate, AggregateMethod}, + search::{SimilarQuery, SimilarResult}, +}; + +aggregate_methods!( + SimilarPOST => "Similar POST", + SimilarGET => "Similar GET", +); + +#[derive(Default)] +pub struct SimilarAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // Whether a non-default embedder was specified + retrieve_vectors: bool, + + // pagination + max_limit: usize, + max_offset: usize, + + // formatting + max_attributes_to_retrieve: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SimilarAggregator { + 
#[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SimilarQuery) -> Self { + let SimilarQuery { + id: _, + embedder: _, + offset, + limit, + attributes_to_retrieve: _, + retrieve_vectors, + show_ranking_score, + show_ranking_score_details, + filter, + ranking_score_threshold, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + ret.max_limit = *limit; + ret.max_offset = *offset; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + ret.retrieve_vectors = *retrieve_vectors; + + ret + } + + pub fn succeed(&mut self, result: &SimilarResult) { + let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SimilarAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + /// Aggregate one [SimilarAggregator] into another. 
+ fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.time_spent.append(&mut time_spent); + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + self.retrieve_vectors |= retrieve_vectors; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + 
max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *self; + + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + } + }) + } +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index c25aeee70..b7260ea08 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -25,6 +25,7 @@ pub mod indexes; mod logs; mod metrics; mod multi_search; +mod multi_search_analytics; mod snapshot; mod swap_indexes; pub mod tasks; diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index 
5fcb868c6..b7bd31716 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -9,7 +9,7 @@ use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; -use crate::analytics::{Analytics, MultiSearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -21,6 +21,8 @@ use crate::search::{ }; use crate::search_queue::SearchQueue; +use super::multi_search_analytics::MultiSearchAggregator; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(multi_search_with_post)))); } @@ -35,7 +37,7 @@ pub async fn multi_search_with_post( search_queue: Data, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { // Since we don't want to process half of the search requests and then get a permit refused // we're going to get one permit for the whole duration of the multi-search request. @@ -43,7 +45,7 @@ pub async fn multi_search_with_post( let federated_search = params.into_inner(); - let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search, &req); + let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search); let FederatedSearch { mut queries, federation } = federated_search; @@ -87,7 +89,7 @@ pub async fn multi_search_with_post( multi_aggregate.succeed(); } - analytics.post_multi_search(multi_aggregate); + analytics.publish(multi_aggregate, &req); HttpResponse::Ok().json(search_result??) 
} None => { @@ -149,7 +151,7 @@ pub async fn multi_search_with_post( if search_results.is_ok() { multi_aggregate.succeed(); } - analytics.post_multi_search(multi_aggregate); + analytics.publish(multi_aggregate, &req); let search_results = search_results.map_err(|(mut err, query_index)| { // Add the query index that failed as context for the error message. diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs new file mode 100644 index 000000000..be1218399 --- /dev/null +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -0,0 +1,170 @@ +use std::collections::HashSet; + +use serde_json::json; + +use crate::{ + analytics::Aggregate, + search::{FederatedSearch, SearchQueryWithIndex}, +}; + +#[derive(Default)] +pub struct MultiSearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + + // sum of the number of distinct indexes in each single request, use with total_received to compute an avg + total_distinct_index_count: usize, + // number of queries with a single index, use with total_received to compute a proportion + total_single_index: usize, + + // sum of the number of search queries in the requests, use with total_received to compute an average + total_search_count: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + + // federation + use_federation: bool, +} + +impl MultiSearchAggregator { + pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { + let use_federation = federated_search.federation.is_some(); + + let distinct_indexes: HashSet<_> = federated_search + .queries + .iter() + .map(|query| { + let query = &query; + // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex + let SearchQueryWithIndex { + index_uid, + federation_options: _, + q: _, + vector: _, + offset: _, + limit: _, + page: _, + hits_per_page: _, + attributes_to_retrieve: _, + 
retrieve_vectors: _, + attributes_to_crop: _, + crop_length: _, + attributes_to_highlight: _, + show_ranking_score: _, + show_ranking_score_details: _, + show_matches_position: _, + filter: _, + sort: _, + distinct: _, + facets: _, + highlight_pre_tag: _, + highlight_post_tag: _, + crop_marker: _, + matching_strategy: _, + attributes_to_search_on: _, + hybrid: _, + ranking_score_threshold: _, + locales: _, + } = query; + + index_uid.as_str() + }) + .collect(); + + let show_ranking_score = + federated_search.queries.iter().any(|query| query.show_ranking_score); + let show_ranking_score_details = + federated_search.queries.iter().any(|query| query.show_ranking_score_details); + + Self { + total_received: 1, + total_succeeded: 0, + total_distinct_index_count: distinct_indexes.len(), + total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, + total_search_count: federated_search.queries.len(), + show_ranking_score, + show_ranking_score_details, + use_federation, + } + } + + pub fn succeed(&mut self) { + self.total_succeeded = self.total_succeeded.saturating_add(1); + } +} + +impl Aggregate for MultiSearchAggregator { + fn event_name(&self) -> &'static str { + "Documents Searched by Multi-Search POST" + } + + /// Aggregate one [MultiSearchAggregator] into another. + fn aggregate(self: Box, new: Box) -> Box { + // write the aggregate in a way that will cause a compilation error if a field is added. + + // get ownership of self, replacing it by a default value. 
+ let this = *self; + + let total_received = this.total_received.saturating_add(new.total_received); + let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); + let total_distinct_index_count = + this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); + let total_single_index = this.total_single_index.saturating_add(new.total_single_index); + let total_search_count = this.total_search_count.saturating_add(new.total_search_count); + let show_ranking_score = this.show_ranking_score || new.show_ranking_score; + let show_ranking_score_details = + this.show_ranking_score_details || new.show_ranking_score_details; + let use_federation = this.use_federation || new.use_federation; + + Box::new(Self { + total_received, + total_succeeded, + total_distinct_index_count, + total_single_index, + total_search_count, + show_ranking_score, + show_ranking_score_details, + use_federation, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + total_distinct_index_count, + total_single_index, + total_search_count, + show_ranking_score, + show_ranking_score_details, + use_federation, + } = *self; + + json!({ + "requests": { + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "indexes": { + "total_single_index": total_single_index, + "total_distinct_index_count": total_distinct_index_count, + "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early + }, + "searches": { + "total_search_count": total_search_count, + "avg_search_count": (total_search_count as f64) / (total_received as f64), + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + }, + "federation": { + "use_federation": use_federation, + } + }) + } +} diff --git 
a/crates/meilisearch/src/routes/snapshot.rs b/crates/meilisearch/src/routes/snapshot.rs index 84673729f..cacbc41af 100644 --- a/crates/meilisearch/src/routes/snapshot.rs +++ b/crates/meilisearch/src/routes/snapshot.rs @@ -3,7 +3,6 @@ use actix_web::{web, HttpRequest, HttpResponse}; use index_scheduler::IndexScheduler; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; use crate::analytics::Analytics; @@ -17,13 +16,15 @@ pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); } +crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created"); + pub async fn create_snapshot( index_scheduler: GuardedData, Data>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { - analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req)); + analytics.publish(SnapshotAnalytics::default(), &req); let task = KindWithContent::SnapshotCreation; let uid = get_task_id(&req, &opt)?; diff --git a/crates/meilisearch/src/routes/swap_indexes.rs b/crates/meilisearch/src/routes/swap_indexes.rs index 51a7b0707..9b8b67e63 100644 --- a/crates/meilisearch/src/routes/swap_indexes.rs +++ b/crates/meilisearch/src/routes/swap_indexes.rs @@ -8,10 +8,10 @@ use meilisearch_types::error::deserr_codes::InvalidSwapIndexes; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; -use serde_json::json; +use serde::Serialize; use super::{get_task_id, is_dry_run, SummarizedTaskView}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -29,21 +29,36 @@ pub struct SwapIndexesPayload { indexes: Vec, } 
+#[derive(Serialize)] +struct IndexSwappedAnalytics { + swap_operation_number: usize, +} + +impl Aggregate for IndexSwappedAnalytics { + fn event_name(&self) -> &'static str { + "Indexes Swapped" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + swap_operation_number: self.swap_operation_number.max(new.swap_operation_number), + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn swap_indexes( index_scheduler: GuardedData, Data>, params: AwebJson, DeserrJsonError>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let params = params.into_inner(); - analytics.publish( - "Indexes Swapped".to_string(), - json!({ - "swap_operation_number": params.len(), - }), - Some(&req), - ); + analytics.publish(IndexSwappedAnalytics { swap_operation_number: params.len() }, &req); let filters = index_scheduler.filters(); let mut swaps = vec![]; diff --git a/crates/meilisearch/src/routes/tasks.rs b/crates/meilisearch/src/routes/tasks.rs index 02f009ff7..95959d6d5 100644 --- a/crates/meilisearch/src/routes/tasks.rs +++ b/crates/meilisearch/src/routes/tasks.rs @@ -12,18 +12,17 @@ use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status}; use serde::Serialize; -use serde_json::json; use time::format_description::well_known::Rfc3339; use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; use tokio::task; use super::{get_task_id, is_dry_run, SummarizedTaskView}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::Opt; +use crate::{aggregate_methods, Opt}; const DEFAULT_LIMIT: u32 = 20; 
@@ -158,12 +157,69 @@ impl TaskDeletionOrCancelationQuery { } } +aggregate_methods!( + CancelTasks => "Tasks Canceled", + DeleteTasks => "Tasks Deleted", +); + +#[derive(Serialize)] +struct TaskFilterAnalytics { + filtered_by_uid: bool, + filtered_by_index_uid: bool, + filtered_by_type: bool, + filtered_by_status: bool, + filtered_by_canceled_by: bool, + filtered_by_before_enqueued_at: bool, + filtered_by_after_enqueued_at: bool, + filtered_by_before_started_at: bool, + filtered_by_after_started_at: bool, + filtered_by_before_finished_at: bool, + filtered_by_after_finished_at: bool, + + #[serde(skip)] + marker: std::marker::PhantomData, +} + +impl Aggregate for TaskFilterAnalytics { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + filtered_by_uid: self.filtered_by_uid | new.filtered_by_uid, + filtered_by_index_uid: self.filtered_by_index_uid | new.filtered_by_index_uid, + filtered_by_type: self.filtered_by_type | new.filtered_by_type, + filtered_by_status: self.filtered_by_status | new.filtered_by_status, + filtered_by_canceled_by: self.filtered_by_canceled_by | new.filtered_by_canceled_by, + filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at + | new.filtered_by_before_enqueued_at, + filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at + | new.filtered_by_after_enqueued_at, + filtered_by_before_started_at: self.filtered_by_before_started_at + | new.filtered_by_before_started_at, + filtered_by_after_started_at: self.filtered_by_after_started_at + | new.filtered_by_after_started_at, + filtered_by_before_finished_at: self.filtered_by_before_finished_at + | new.filtered_by_before_finished_at, + filtered_by_after_finished_at: self.filtered_by_after_finished_at + | new.filtered_by_after_finished_at, + + marker: std::marker::PhantomData, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + 
async fn cancel_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -172,21 +228,22 @@ async fn cancel_tasks( } analytics.publish( - "Tasks Canceled".to_string(), - json!({ - "filtered_by_uid": params.uids.is_some(), - "filtered_by_index_uid": params.index_uids.is_some(), - "filtered_by_type": params.types.is_some(), - "filtered_by_status": params.statuses.is_some(), - "filtered_by_canceled_by": params.canceled_by.is_some(), - "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(), - "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(), - "filtered_by_before_started_at": params.before_started_at.is_some(), - "filtered_by_after_started_at": params.after_started_at.is_some(), - "filtered_by_before_finished_at": params.before_finished_at.is_some(), - "filtered_by_after_finished_at": params.after_finished_at.is_some(), - }), - Some(&req), + TaskFilterAnalytics:: { + filtered_by_uid: params.uids.is_some(), + filtered_by_index_uid: params.index_uids.is_some(), + filtered_by_type: params.types.is_some(), + filtered_by_status: params.statuses.is_some(), + filtered_by_canceled_by: params.canceled_by.is_some(), + filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(), + filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(), + filtered_by_before_started_at: params.before_started_at.is_some(), + filtered_by_after_started_at: params.after_started_at.is_some(), + filtered_by_before_finished_at: params.before_finished_at.is_some(), + filtered_by_after_finished_at: params.after_finished_at.is_some(), + + marker: std::marker::PhantomData, + }, + &req, ); let query = params.into_query(); @@ -214,7 +271,7 @@ async fn delete_tasks( params: AwebQueryParameter, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let params = 
params.into_inner(); @@ -223,22 +280,24 @@ async fn delete_tasks( } analytics.publish( - "Tasks Deleted".to_string(), - json!({ - "filtered_by_uid": params.uids.is_some(), - "filtered_by_index_uid": params.index_uids.is_some(), - "filtered_by_type": params.types.is_some(), - "filtered_by_status": params.statuses.is_some(), - "filtered_by_canceled_by": params.canceled_by.is_some(), - "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(), - "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(), - "filtered_by_before_started_at": params.before_started_at.is_some(), - "filtered_by_after_started_at": params.after_started_at.is_some(), - "filtered_by_before_finished_at": params.before_finished_at.is_some(), - "filtered_by_after_finished_at": params.after_finished_at.is_some(), - }), - Some(&req), + TaskFilterAnalytics:: { + filtered_by_uid: params.uids.is_some(), + filtered_by_index_uid: params.index_uids.is_some(), + filtered_by_type: params.types.is_some(), + filtered_by_status: params.statuses.is_some(), + filtered_by_canceled_by: params.canceled_by.is_some(), + filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(), + filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(), + filtered_by_before_started_at: params.before_started_at.is_some(), + filtered_by_after_started_at: params.after_started_at.is_some(), + filtered_by_before_finished_at: params.before_finished_at.is_some(), + filtered_by_after_finished_at: params.after_finished_at.is_some(), + + marker: std::marker::PhantomData, + }, + &req, ); + let query = params.into_query(); let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes( diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 66b6e56de..7832c1761 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -1195,8 +1195,13 @@ impl<'a> HitMaker<'a> { let vectors_is_hidden = match (&displayed_ids, vectors_fid) { 
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid (None, _) => false, - // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field - (Some(_), None) => true, + // vectors has no fid, so check its explicit name + (Some(_), None) => { + // unwrap as otherwise we'd go to the first one + let displayed_names = index.displayed_fields(rtxn)?.unwrap(); + !displayed_names + .contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME) + } // displayed_ids is a finit list, so hide if `_vectors` is not part of it (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), }; diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 200f6a141..5069c9ea6 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -386,7 +386,6 @@ pub fn default_settings(dir: impl AsRef) -> Opt { db_path: dir.as_ref().join("db"), dump_dir: dir.as_ref().join("dumps"), env: "development".to_owned(), - #[cfg(feature = "analytics")] no_analytics: true, max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(), max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(), diff --git a/crates/meilisearch/tests/common/service.rs b/crates/meilisearch/tests/common/service.rs index 8addbacf8..c0b07c217 100644 --- a/crates/meilisearch/tests/common/service.rs +++ b/crates/meilisearch/tests/common/service.rs @@ -9,8 +9,9 @@ use actix_web::test; use actix_web::test::TestRequest; use actix_web::web::Data; use index_scheduler::IndexScheduler; +use meilisearch::analytics::Analytics; use meilisearch::search_queue::SearchQueue; -use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; +use meilisearch::{create_app, Opt, SubscriberForSecondLayer}; use meilisearch_auth::AuthController; use tracing::level_filters::LevelFilter; use tracing_subscriber::Layer; @@ -141,7 +142,7 @@ impl Service { 
Data::new(search_queue), self.options.clone(), (route_layer_handle, stderr_layer_handle), - analytics::MockAnalytics::new(&self.options), + Data::new(Analytics::no_analytics()), true, )) .await diff --git a/crates/meilisearch/tests/logs/mod.rs b/crates/meilisearch/tests/logs/mod.rs index 9f4649dca..26482b561 100644 --- a/crates/meilisearch/tests/logs/mod.rs +++ b/crates/meilisearch/tests/logs/mod.rs @@ -7,8 +7,9 @@ use std::str::FromStr; use actix_web::http::header::ContentType; use actix_web::web::Data; use meili_snap::snapshot; +use meilisearch::analytics::Analytics; use meilisearch::search_queue::SearchQueue; -use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; +use meilisearch::{create_app, Opt, SubscriberForSecondLayer}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::Layer; @@ -54,7 +55,7 @@ async fn basic_test_log_stream_route() { Data::new(search_queue), server.service.options.clone(), (route_layer_handle, stderr_layer_handle), - analytics::MockAnalytics::new(&server.service.options), + Data::new(Analytics::no_analytics()), true, )) .await; diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index e301c0b05..00a65d9aa 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ b/crates/meilisearch/tests/search/hybrid.rs @@ -568,6 +568,57 @@ async fn retrieve_vectors() { ] "###); + // use explicit `_vectors` in displayed attributes + let (response, code) = index + .update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} )) + .await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"embedder": "default", "semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => 
"[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + } + ] + "###); + // remove `_vectors` from displayed attributes let (response, code) = index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs index 4f07ca18b..ed45913a8 100644 --- a/crates/meilisearch/tests/vector/settings.rs +++ b/crates/meilisearch/tests/vector/settings.rs @@ -4,6 +4,53 @@ use crate::common::{GetAllDocumentsOptions, Server}; use crate::json; use crate::vector::generate_default_user_provided_documents; +#[actix_rt::test] +async fn field_unavailable_for_source() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false, + "editDocumentsByFunction": false, + "containsFilter": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}}, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). 
Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "default": {"source": "openAi", "revision": "42"}}, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); +} + #[actix_rt::test] async fn update_embedder() { let server = Server::new().await; diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 5fc2d65c8..df0e59496 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -79,8 +79,8 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.9" liquid = "0.26.6" -rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } +arroy = "0.5.0" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.10.0", features = ["json"] } @@ -98,16 +98,7 @@ rand = { version = "0.8.5", features = ["small_rng"] } [features] all-tokenizations = [ - "charabia/chinese", - "charabia/hebrew", - "charabia/japanese", - "charabia/thai", - "charabia/korean", - "charabia/greek", - "charabia/khmer", - 
"charabia/vietnamese", - "charabia/swedish-recomposition", - "charabia/german-segmentation", + "charabia/default", ] # Use POSIX semaphores instead of SysV semaphores in LMDB @@ -146,5 +137,8 @@ german = ["charabia/german-segmentation"] # force swedish character recomposition swedish-recomposition = ["charabia/swedish-recomposition"] +# allow turkish specialized tokenization +turkish = ["charabia/turkish"] + # allow CUDA support, see cuda = ["candle-core/cuda"] diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index f4bd42689..3b48b50f2 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -298,6 +298,7 @@ impl From for Error { arroy::Error::InvalidVecDimension { expected, received } => { Error::UserError(UserError::InvalidVectorDimensions { expected, found: received }) } + arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation), arroy::Error::DatabaseFull | arroy::Error::InvalidItemAppend | arroy::Error::UnmatchingDistance { .. } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index c47896df7..5b7a9c58c 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1610,24 +1610,6 @@ impl Index { .unwrap_or_default()) } - pub fn arroy_readers<'a>( - &'a self, - rtxn: &'a RoTxn<'a>, - embedder_id: u8, - quantized: bool, - ) -> impl Iterator> + 'a { - crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| { - let reader = ArroyWrapper::new(self.vector_arroy, k, quantized); - // Here we don't care about the dimensions, but we want to know if we can read - // in the database or if its metadata are missing because there is no document with that many vectors. 
- match reader.dimensions(rtxn) { - Ok(_) => Some(Ok(reader)), - Err(arroy::Error::MissingMetadata(_)) => None, - Err(e) => Some(Err(e.into())), - } - }) - } - pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { self.main.remap_types::().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) } @@ -1649,14 +1631,9 @@ impl Index { let embedding_configs = self.embedding_configs(rtxn)?; for config in embedding_configs { let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); - let embeddings = self - .arroy_readers(rtxn, embedder_id, config.config.quantized()) - .map_while(|reader| { - reader - .and_then(|r| r.item_vector(rtxn, docid).map_err(|e| e.into())) - .transpose() - }) - .collect::>>()?; + let reader = + ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + let embeddings = reader.item_vectors(rtxn, docid)?; res.insert(config.name.to_owned(), embeddings); } Ok(res) diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index de1dacbe7..90377c09c 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -1,11 +1,10 @@ use std::iter::FromIterator; -use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use crate::score_details::{self, ScoreDetails}; -use crate::vector::{DistributionShift, Embedder}; +use crate::vector::{ArroyWrapper, DistributionShift, Embedder}; use crate::{DocumentId, Result, SearchContext, SearchLogger}; pub struct VectorSort { @@ -53,14 +52,9 @@ impl VectorSort { vector_candidates: &RoaringBitmap, ) -> Result<()> { let target = &self.target; - let mut results = Vec::new(); - for reader in ctx.index.arroy_readers(ctx.txn, self.embedder_index, self.quantized) { - let nns_by_vector = - reader?.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; - 
results.extend(nns_by_vector.into_iter()); - } - results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized); + let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; self.cached_sorted_docids = results.into_iter(); Ok(()) diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index 0cb8d723d..5547d800e 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -1,10 +1,9 @@ use std::sync::Arc; -use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::score_details::{self, ScoreDetails}; -use crate::vector::Embedder; +use crate::vector::{ArroyWrapper, Embedder}; use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult}; pub struct Similar<'a> { @@ -71,23 +70,13 @@ impl<'a> Similar<'a> { .get(self.rtxn, &self.embedder_name)? .ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?; - let mut results = Vec::new(); - - for reader in self.index.arroy_readers(self.rtxn, embedder_index, self.quantized) { - let nns_by_item = reader?.nns_by_item( - self.rtxn, - self.id, - self.limit + self.offset + 1, - Some(&universe), - )?; - if let Some(mut nns_by_item) = nns_by_item { - results.append(&mut nns_by_item); - } else { - break; - } - } - - results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); + let results = reader.nns_by_item( + self.rtxn, + self.id, + self.limit + self.offset + 1, + Some(&universe), + )?; let mut documents_ids = Vec::with_capacity(self.limit); let mut document_scores = Vec::with_capacity(self.limit); diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 326dd842d..88d20fff0 100644 --- 
a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -689,9 +689,8 @@ where key: None, }, )?; - let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap(); let reader = - ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized); + ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); let dim = reader.dimensions(self.wtxn)?; dimension.insert(name.to_string(), dim); } @@ -700,6 +699,7 @@ where for (embedder_name, dimension) in dimension { let wtxn = &mut *self.wtxn; let vector_arroy = self.index.vector_arroy; + let cancel = &self.should_abort; let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, @@ -713,17 +713,8 @@ where let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized); pool.install(|| { - for k in crate::vector::arroy_db_range_for_embedder(embedder_index) { - let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized); - if is_quantizing { - writer.quantize(wtxn, k, dimension)?; - } - if writer.need_build(wtxn, dimension)? { - writer.build(wtxn, &mut rng, dimension)?; - } else if writer.is_empty(wtxn, dimension)? 
{ - break; - } - } + let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); + writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?; Result::Ok(()) }) .map_err(InternalError::from)??; diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index bb2cfe56c..763f30d0f 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -990,27 +990,24 @@ impl<'a, 'i> Transform<'a, 'i> { None }; - let readers: Result, &RoaringBitmap)>> = settings_diff + let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff .embedding_config_updates .iter() .filter_map(|(name, action)| { if let Some(WriteBackToDocuments { embedder_id, user_provided }) = action.write_back() { - let readers: Result> = self - .index - .arroy_readers(wtxn, *embedder_id, action.was_quantized) - .collect(); - match readers { - Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))), - Err(error) => Some(Err(error)), - } + let reader = ArroyWrapper::new( + self.index.vector_arroy, + *embedder_id, + action.was_quantized, + ); + Some((name.as_str(), (reader, user_provided))) } else { None } }) .collect(); - let readers = readers?; let old_vectors_fid = settings_diff .old @@ -1048,34 +1045,24 @@ impl<'a, 'i> Transform<'a, 'i> { arroy::Error, > = readers .iter() - .filter_map(|(name, (readers, user_provided))| { + .filter_map(|(name, (reader, user_provided))| { if !user_provided.contains(docid) { return None; } - let mut vectors = Vec::new(); - for reader in readers { - let Some(vector) = reader.item_vector(wtxn, docid).transpose() else { - break; - }; - - match vector { - Ok(vector) => vectors.push(vector), - Err(error) => return Some(Err(error)), - } + match reader.item_vectors(wtxn, docid) { + Ok(vectors) if vectors.is_empty() => None, + Ok(vectors) => Some(Ok(( + name.to_string(), + 
serde_json::to_value(ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(vectors), + ), + regenerate: false, + }) + .unwrap(), + ))), + Err(e) => Some(Err(e)), } - if vectors.is_empty() { - return None; - } - Some(Ok(( - name.to_string(), - serde_json::to_value(ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - vectors, - )), - regenerate: false, - }) - .unwrap(), - ))) }) .collect(); @@ -1104,11 +1091,9 @@ impl<'a, 'i> Transform<'a, 'i> { } // delete all vectors from the embedders that need removal - for (_, (readers, _)) in readers { - for reader in readers { - let dimensions = reader.dimensions(wtxn)?; - reader.clear(wtxn, dimensions)?; - } + for (_, (reader, _)) in readers { + let dimensions = reader.dimensions(wtxn)?; + reader.clear(wtxn, dimensions)?; } let grenad_params = GrenadParameters { diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 97a4bf712..20e70b2a6 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -673,22 +673,14 @@ pub(crate) fn write_typed_chunk_into_index( .get(&embedder_name) .map_or(false, |conf| conf.2); // FIXME: allow customizing distance - let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index) - .map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized)) - .collect(); + let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, _)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - - for writer in &writers { - // Uses invariant: vectors are packed in the first writers. - if !writer.del_item(wtxn, expected_dimension, docid)? 
{ - break; - } - } + writer.del_items(wtxn, expected_dimension, docid)?; } // add generated embeddings @@ -716,9 +708,7 @@ pub(crate) fn write_typed_chunk_into_index( embeddings.embedding_count(), ))); } - for (embedding, writer) in embeddings.iter().zip(&writers) { - writer.add_item(wtxn, expected_dimension, docid, embedding)?; - } + writer.add_items(wtxn, docid, &embeddings)?; } // perform the manual diff @@ -733,51 +723,14 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { let vector: Vec = pod_collect_to_vec(value); - let mut deleted_index = None; - for (index, writer) in writers.iter().enumerate() { - let Some(candidate) = writer.item_vector(wtxn, docid)? else { - // uses invariant: vectors are packed in the first writers. - break; - }; - if candidate == vector { - writer.del_item(wtxn, expected_dimension, docid)?; - deleted_index = Some(index); - } - } - - // 🥲 enforce invariant: vectors are packed in the first writers. - if let Some(deleted_index) = deleted_index { - let mut last_index_with_a_vector = None; - for (index, writer) in writers.iter().enumerate().skip(deleted_index) { - let Some(candidate) = writer.item_vector(wtxn, docid)? else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } - if let Some((last_index, vector)) = last_index_with_a_vector { - // unwrap: computed the index from the list of writers - let writer = writers.get(last_index).unwrap(); - writer.del_item(wtxn, expected_dimension, docid)?; - writers.get(deleted_index).unwrap().add_item( - wtxn, - expected_dimension, - docid, - &vector, - )?; - } - } + writer.del_item(wtxn, docid, &vector)?; } if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { let vector = pod_collect_to_vec(value); // overflow was detected during vector extraction. - for writer in &writers { - if !writer.contains_item(wtxn, expected_dimension, docid)? 
{ - writer.add_item(wtxn, expected_dimension, docid, &vector)?; - break; - } - } + writer.add_item(wtxn, docid, &vector)?; } } diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index d52e68bbe..571c02c8c 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::sync::Arc; -use arroy::distances::{Angular, BinaryQuantizedAngular}; +use arroy::distances::{BinaryQuantizedCosine, Cosine}; use arroy::ItemId; use deserr::{DeserializeError, Deserr}; use heed::{RoTxn, RwTxn, Unspecified}; @@ -32,105 +32,243 @@ pub const REQUEST_PARALLELISM: usize = 40; pub struct ArroyWrapper { quantized: bool, - index: u16, + embedder_index: u8, database: arroy::Database, } impl ArroyWrapper { - pub fn new(database: arroy::Database, index: u16, quantized: bool) -> Self { - Self { database, index, quantized } + pub fn new( + database: arroy::Database, + embedder_index: u8, + quantized: bool, + ) -> Self { + Self { database, embedder_index, quantized } } - pub fn index(&self) -> u16 { - self.index + pub fn embedder_index(&self) -> u8 { + self.embedder_index + } + + fn readers<'a, D: arroy::Distance>( + &'a self, + rtxn: &'a RoTxn<'a>, + db: arroy::Database, + ) -> impl Iterator, arroy::Error>> + 'a { + arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| { + match arroy::Reader::open(rtxn, index, db) { + Ok(reader) => match reader.is_empty(rtxn) { + Ok(false) => Some(Ok(reader)), + Ok(true) => None, + Err(e) => Some(Err(e)), + }, + Err(arroy::Error::MissingMetadata(_)) => None, + Err(e) => Some(Err(e)), + } + }) } pub fn dimensions(&self, rtxn: &RoTxn) -> Result { + let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap(); if self.quantized { - Ok(arroy::Reader::open(rtxn, self.index, self.quantized_db())?.dimensions()) + Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions()) } else { - Ok(arroy::Reader::open(rtxn, self.index, 
self.angular_db())?.dimensions()) + Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions()) } } - pub fn quantize( + pub fn build_and_quantize( &mut self, wtxn: &mut RwTxn, - index: u16, + rng: &mut R, dimension: usize, + quantizing: bool, + cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { - if !self.quantized { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - writer.prepare_changing_distance::(wtxn)?; - self.quantized = true; + for index in arroy_db_range_for_embedder(self.embedder_index) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.need_build(wtxn)? { + writer.builder(rng).build(wtxn)? + } else if writer.is_empty(wtxn)? { + break; + } + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + // If we are quantizing the databases, we can't know from meilisearch + // if the db was empty but still contained the wrong metadata, thus we need + // to quantize everything and can't stop early. Since this operation can + // only happens once in the life of an embedder, it's not very performances + // sensitive. + if quantizing && !self.quantized { + let writer = writer.prepare_changing_distance::(wtxn)?; + writer.builder(rng).cancel(cancel).build(wtxn)?; + } else if writer.need_build(wtxn)? { + writer.builder(rng).cancel(cancel).build(wtxn)?; + } else if writer.is_empty(wtxn)? { + break; + } + } } Ok(()) } - pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).need_build(rtxn) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).need_build(rtxn) - } - } - - pub fn build( + /// Overwrite all the embeddings associated with the index and item ID. + /// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings. 
+ /// You should call `del_items` on the `item_id` before calling this method. + /// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored. + pub fn add_items( &self, wtxn: &mut RwTxn, - rng: &mut R, - dimension: usize, + item_id: arroy::ItemId, + embeddings: &Embeddings, ) -> Result<(), arroy::Error> { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).build(wtxn, rng, None) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).build(wtxn, rng, None) + let dimension = embeddings.dimension(); + for (index, vector) in + arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + { + if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } else { + arroy::Writer::new(self.angular_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } } + Ok(()) } + /// Add one document int for this index where we can find an empty spot. pub fn add_item( &self, wtxn: &mut RwTxn, - dimension: usize, item_id: arroy::ItemId, vector: &[f32], ) -> Result<(), arroy::Error> { if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension) - .add_item(wtxn, item_id, vector) + self._add_item(wtxn, self.quantized_db(), item_id, vector) } else { - arroy::Writer::new(self.angular_db(), self.index, dimension) - .add_item(wtxn, item_id, vector) + self._add_item(wtxn, self.angular_db(), item_id, vector) } } - pub fn del_item( + fn _add_item( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + vector: &[f32], + ) -> Result<(), arroy::Error> { + let dimension = vector.len(); + + for index in arroy_db_range_for_embedder(self.embedder_index) { + let writer = arroy::Writer::new(db, index, dimension); + if !writer.contains_item(wtxn, item_id)? 
{ + writer.add_item(wtxn, item_id, vector)?; + break; + } + } + Ok(()) + } + + /// Delete all embeddings from a specific `item_id` + pub fn del_items( &self, wtxn: &mut RwTxn, dimension: usize, item_id: arroy::ItemId, + ) -> Result<(), arroy::Error> { + for index in arroy_db_range_for_embedder(self.embedder_index) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if !writer.del_item(wtxn, item_id)? { + break; + } + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + if !writer.del_item(wtxn, item_id)? { + break; + } + } + } + + Ok(()) + } + + /// Delete one item. + pub fn del_item( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + vector: &[f32], ) -> Result { if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).del_item(wtxn, item_id) + self._del_item(wtxn, self.quantized_db(), item_id, vector) } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).del_item(wtxn, item_id) + self._del_item(wtxn, self.angular_db(), item_id, vector) } } + fn _del_item( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + vector: &[f32], + ) -> Result { + let dimension = vector.len(); + let mut deleted_index = None; + + for index in arroy_db_range_for_embedder(self.embedder_index) { + let writer = arroy::Writer::new(db, index, dimension); + let Some(candidate) = writer.item_vector(wtxn, item_id)? else { + // uses invariant: vectors are packed in the first writers. + break; + }; + if candidate == vector { + writer.del_item(wtxn, item_id)?; + deleted_index = Some(index); + } + } + + // 🥲 enforce invariant: vectors are packed in the first writers. 
+ if let Some(deleted_index) = deleted_index { + let mut last_index_with_a_vector = None; + for index in + arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize) + { + let writer = arroy::Writer::new(db, index, dimension); + let Some(candidate) = writer.item_vector(wtxn, item_id)? else { + break; + }; + last_index_with_a_vector = Some((index, candidate)); + } + if let Some((last_index, vector)) = last_index_with_a_vector { + let writer = arroy::Writer::new(db, last_index, dimension); + writer.del_item(wtxn, item_id)?; + let writer = arroy::Writer::new(db, deleted_index, dimension); + writer.add_item(wtxn, item_id, &vector)?; + } + } + Ok(deleted_index.is_some()) + } + pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).clear(wtxn) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).clear(wtxn) - } - } - - pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).is_empty(rtxn) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).is_empty(rtxn) + for index in arroy_db_range_for_embedder(self.embedder_index) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.is_empty(wtxn)? { + break; + } + writer.clear(wtxn)?; + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + if writer.is_empty(wtxn)? 
{ + break; + } + writer.clear(wtxn)?; + } } + Ok(()) } pub fn contains_item( @@ -139,11 +277,25 @@ impl ArroyWrapper { dimension: usize, item: arroy::ItemId, ) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).contains_item(rtxn, item) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).contains_item(rtxn, item) + for index in arroy_db_range_for_embedder(self.embedder_index) { + let contains = if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.is_empty(rtxn)? { + break; + } + writer.contains_item(rtxn, item)? + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + if writer.is_empty(rtxn)? { + break; + } + writer.contains_item(rtxn, item)? + }; + if contains { + return Ok(contains); + } } + Ok(false) } pub fn nns_by_item( @@ -152,45 +304,108 @@ impl ArroyWrapper { item: ItemId, limit: usize, filter: Option<&RoaringBitmap>, - ) -> Result>, arroy::Error> { + ) -> Result, arroy::Error> { if self.quantized { - arroy::Reader::open(rtxn, self.index, self.quantized_db())? - .nns_by_item(rtxn, item, limit, None, None, filter) + self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter) } else { - arroy::Reader::open(rtxn, self.index, self.angular_db())? - .nns_by_item(rtxn, item, limit, None, None, filter) + self._nns_by_item(rtxn, self.angular_db(), item, limit, filter) } } + fn _nns_by_item( + &self, + rtxn: &RoTxn, + db: arroy::Database, + item: ItemId, + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, arroy::Error> { + let mut results = Vec::new(); + + for reader in self.readers(rtxn, db) { + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + searcher.candidates(filter); + } + + if let Some(mut ret) = searcher.by_item(rtxn, item)? 
{ + results.append(&mut ret); + } else { + break; + } + } + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + Ok(results) + } + pub fn nns_by_vector( &self, - txn: &RoTxn, - item: &[f32], + rtxn: &RoTxn, + vector: &[f32], limit: usize, filter: Option<&RoaringBitmap>, ) -> Result, arroy::Error> { if self.quantized { - arroy::Reader::open(txn, self.index, self.quantized_db())? - .nns_by_vector(txn, item, limit, None, None, filter) + self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter) } else { - arroy::Reader::open(txn, self.index, self.angular_db())? - .nns_by_vector(txn, item, limit, None, None, filter) + self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter) } } - pub fn item_vector(&self, rtxn: &RoTxn, docid: u32) -> Result>, arroy::Error> { + fn _nns_by_vector( + &self, + rtxn: &RoTxn, + db: arroy::Database, + vector: &[f32], + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, arroy::Error> { + let mut results = Vec::new(); + + for reader in self.readers(rtxn, db) { + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + searcher.candidates(filter); + } + + results.append(&mut searcher.by_vector(rtxn, vector)?); + } + + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + + Ok(results) + } + + pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result>, arroy::Error> { + let mut vectors = Vec::new(); + if self.quantized { - arroy::Reader::open(rtxn, self.index, self.quantized_db())?.item_vector(rtxn, docid) + for reader in self.readers(rtxn, self.quantized_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } else { + break; + } + } } else { - arroy::Reader::open(rtxn, self.index, self.angular_db())?.item_vector(rtxn, docid) + for reader in self.readers(rtxn, self.angular_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? 
{ + vectors.push(vec); + } else { + break; + } + } } + Ok(vectors) } - fn angular_db(&self) -> arroy::Database { + fn angular_db(&self) -> arroy::Database { self.database.remap_data_type() } - fn quantized_db(&self) -> arroy::Database { + fn quantized_db(&self) -> arroy::Database { self.database.remap_data_type() } } diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 3bb7f09e6..d1cf364a2 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -417,6 +417,8 @@ impl EmbeddingSettings { pub const DISTRIBUTION: &'static str = "distribution"; + pub const BINARY_QUANTIZED: &'static str = "binaryQuantized"; + pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { match field { Self::SOURCE => &[ @@ -456,6 +458,13 @@ impl EmbeddingSettings { EmbedderSource::Rest, EmbedderSource::UserProvided, ], + Self::BINARY_QUANTIZED => &[ + EmbedderSource::HuggingFace, + EmbedderSource::Ollama, + EmbedderSource::OpenAi, + EmbedderSource::Rest, + EmbedderSource::UserProvided, + ], _other => unreachable!("unknown field"), } } @@ -470,6 +479,7 @@ impl EmbeddingSettings { Self::DIMENSIONS, Self::DISTRIBUTION, Self::URL, + Self::BINARY_QUANTIZED, ], EmbedderSource::HuggingFace => &[ Self::SOURCE, @@ -477,6 +487,7 @@ impl EmbeddingSettings { Self::REVISION, Self::DOCUMENT_TEMPLATE, Self::DISTRIBUTION, + Self::BINARY_QUANTIZED, ], EmbedderSource::Ollama => &[ Self::SOURCE, @@ -486,8 +497,11 @@ impl EmbeddingSettings { Self::API_KEY, Self::DIMENSIONS, Self::DISTRIBUTION, + Self::BINARY_QUANTIZED, ], - EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION], + EmbedderSource::UserProvided => { + &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION, Self::BINARY_QUANTIZED] + } EmbedderSource::Rest => &[ Self::SOURCE, Self::API_KEY, @@ -498,6 +512,7 @@ impl EmbeddingSettings { Self::RESPONSE, Self::HEADERS, Self::DISTRIBUTION, + Self::BINARY_QUANTIZED, ], } } 
diff --git a/workloads/search/embeddings-movies-subset-hf.json b/workloads/search/embeddings-movies-subset-hf.json index aeeecac59..36f45cfb9 100644 --- a/workloads/search/embeddings-movies-subset-hf.json +++ b/workloads/search/embeddings-movies-subset-hf.json @@ -77,7 +77,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 0.1 + "semanticRatio": 0.1, + "embedder": "default" } } }, @@ -91,7 +92,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 0.5 + "semanticRatio": 0.5, + "embedder": "default" } } }, @@ -105,7 +107,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 0.9 + "semanticRatio": 0.9, + "embedder": "default" } } }, @@ -119,7 +122,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 1.0 + "semanticRatio": 1.0, + "embedder": "default" } } }, @@ -133,7 +137,8 @@ "q": "shrek", "limit": 100, "hybrid": { - "semanticRatio": 1.0 + "semanticRatio": 1.0, + "embedder": "default" } } }, @@ -147,7 +152,8 @@ "q": "shrek", "limit": 100, "hybrid": { - "semanticRatio": 0.5 + "semanticRatio": 0.5, + "embedder": "default" } } }, @@ -161,7 +167,8 @@ "q": "shrek", "limit": 100, "hybrid": { - "semanticRatio": 0.1 + "semanticRatio": 0.1, + "embedder": "default" } } },