diff --git a/.github/workflows/benchmarks-manual.yml b/.github/workflows/benchmarks-manual.yml index da33bf803..14b77c83d 100644 --- a/.github/workflows/benchmarks-manual.yml +++ b/.github/workflows/benchmarks-manual.yml @@ -43,7 +43,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-pr.yml b/.github/workflows/benchmarks-pr.yml index f9d609d6e..a083baa3c 100644 --- a/.github/workflows/benchmarks-pr.yml +++ b/.github/workflows/benchmarks-pr.yml @@ -88,7 +88,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-push-indexing.yml b/.github/workflows/benchmarks-push-indexing.yml index 1fdd5fd67..4495b4b9d 100644 --- a/.github/workflows/benchmarks-push-indexing.yml +++ b/.github/workflows/benchmarks-push-indexing.yml @@ -41,7 +41,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-push-search-songs.yml b/.github/workflows/benchmarks-push-search-songs.yml index b6169ddf7..e9744a434 100644 --- a/.github/workflows/benchmarks-push-search-songs.yml +++ b/.github/workflows/benchmarks-push-search-songs.yml @@ -40,7 +40,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-push-search-wiki.yml b/.github/workflows/benchmarks-push-search-wiki.yml index dd3146a14..bc9e1bcd0 100644 --- a/.github/workflows/benchmarks-push-search-wiki.yml +++ b/.github/workflows/benchmarks-push-search-wiki.yml @@ -40,7 +40,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 016a9d282..c53946fea 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -65,9 +65,9 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, windows-2022] + os: [macos-13, windows-2022] include: - - os: macos-12 + - os: macos-13 artifact_name: meilisearch asset_name: meilisearch-macos-amd64 - os: windows-2022 @@ -90,7 +90,7 @@ jobs: publish-macos-apple-silicon: name: Publish binary for macOS silicon - runs-on: 
macos-12 + runs-on: macos-13 needs: check-version strategy: matrix: diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index ce7fb30b6..90fb03538 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -51,7 +51,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, windows-2022] + os: [macos-13, windows-2022] steps: - uses: actions/checkout@v3 - name: Cache dependencies diff --git a/Cargo.lock b/Cargo.lock index 5f906703a..c2b3a673e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,8 +386,28 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arroy" -version = "0.4.0" -source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e" +dependencies = [ + "bytemuck", + "byteorder", + "heed", + "log", + "memmap2", + "nohash", + "ordered-float", + "rand", + "rayon", + "roaring", + "tempfile", + "thiserror", +] + +[[package]] +name = "arroy" +version = "0.5.0" +source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7" dependencies = [ "bytemuck", "byteorder", @@ -706,9 +726,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" dependencies = [ "bytemuck_derive", ] @@ -2555,7 +2575,7 @@ name = "index-scheduler" version = "1.11.0" dependencies = [ "anyhow", - "arroy", + "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "big_s", "bincode", "crossbeam", @@ -3414,6 +3434,7 @@ dependencies = [ "meilisearch-types", "mimalloc", "mime", + "mopa-maintained", "num_cpus", "obkv", "once_cell", @@ -3516,6 +3537,7 @@ name = "meilitool" version = "1.11.0" dependencies = [ "anyhow", + "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)", "clap", "dump", "file-store", @@ -3546,7 +3568,7 @@ dependencies = [ name = "milli" version = "1.11.0" dependencies = [ - "arroy", + "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "big_s", "bimap", "bincode", @@ -3681,6 +3703,12 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "mopa-maintained" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11" + [[package]] name = "mutually_exclusive_features" version = "0.0.3" @@ -4582,9 +4610,8 @@ dependencies = [ [[package]] name = "rhai" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702" +version = "1.20.0" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4601,8 +4628,7 @@ dependencies = [ [[package]] name = "rhai_codegen" version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "proc-macro2", "quote", diff --git a/bors.toml b/bors.toml index 8750ed993..96e9ef65e 100644 --- a/bors.toml +++ b/bors.toml @@ -1,6 +1,6 @@ status = [ 'Tests on ubuntu-20.04', - 'Tests on macos-12', + 'Tests on macos-13', 'Tests on windows-2022', 'Run Clippy', 'Run Rustfmt', diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 432a86382..e80311005 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -40,7 +40,7 @@ ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } [dev-dependencies] -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +arroy = "0.5.0" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.39.0", features = ["json", "redactions"] } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index fe8244f9b..e0e2bfb75 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1263,7 +1263,7 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?; - tracing::info!("Batch failed {}", error); + tracing::error!("Batch failed {}", error); self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index 55d31b8cf..de42e5586 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -67,5 +67,8 @@ khmer = ["milli/khmer"] vietnamese = ["milli/vietnamese"] # force swedish character recomposition swedish-recomposition = ["milli/swedish-recomposition"] -# force german character recomposition +# allow german tokenization german = ["milli/german"] +# allow turkish normalization +turkish = ["milli/turkish"] + diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 2a16e1017..57202f59f 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -75,7 +75,7 @@ reqwest = { version = "0.12.5", features = [ rustls = { version = "0.23.11", features = ["ring"], default-features = false } rustls-pki-types = { version = "1.7.0", features = ["alloc"] } rustls-pemfile = "2.1.2" -segment = { version = "0.2.4", optional = true } +segment = { version = "0.2.4" } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } sha2 = "0.10.8" @@ -104,6 +104,7 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } roaring = "0.10.2" +mopa-maintained = "0.2.3" [dev-dependencies] actix-rt = "2.10.0" @@ -131,8 +132,7 @@ tempfile = { version = "3.10.1", optional = true } zip = { version = "2.1.3", optional = true } [features] -default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"] -analytics = ["segment"] +default = ["meilisearch-types/all-tokenizations", "mini-dashboard"] mini-dashboard = [ "static-files", "anyhow", @@ -154,7 +154,8 @@ khmer = ["meilisearch-types/khmer"] vietnamese = ["meilisearch-types/vietnamese"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"] german = ["meilisearch-types/german"] +turkish = 
["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" -sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip" +sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e" diff --git a/crates/meilisearch/src/analytics/mod.rs b/crates/meilisearch/src/analytics/mod.rs index 3c7ca0ed3..bd14b0bfa 100644 --- a/crates/meilisearch/src/analytics/mod.rs +++ b/crates/meilisearch/src/analytics/mod.rs @@ -1,44 +1,45 @@ -mod mock_analytics; -#[cfg(feature = "analytics")] -mod segment_analytics; +pub mod segment_analytics; use std::fs; use std::path::{Path, PathBuf}; use std::str::FromStr; +use std::sync::Arc; use actix_web::HttpRequest; +use index_scheduler::IndexScheduler; +use meilisearch_auth::AuthController; use meilisearch_types::InstanceUid; -pub use mock_analytics::MockAnalytics; +use mopa::mopafy; use once_cell::sync::Lazy; use platform_dirs::AppDirs; -use serde_json::Value; - -use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; - -// if the analytics feature is disabled -// the `SegmentAnalytics` point to the mock instead of the real analytics -#[cfg(not(feature = "analytics"))] -pub type SegmentAnalytics = mock_analytics::MockAnalytics; -#[cfg(not(feature = "analytics"))] -pub type SearchAggregator = mock_analytics::SearchAggregator; -#[cfg(not(feature = "analytics"))] -pub type SimilarAggregator = mock_analytics::SimilarAggregator; -#[cfg(not(feature = "analytics"))] -pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator; -#[cfg(not(feature = "analytics"))] -pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator; // if the feature analytics is enabled we use the real analytics -#[cfg(feature = "analytics")] pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; -#[cfg(feature = "analytics")] -pub type SearchAggregator = segment_analytics::SearchAggregator; -#[cfg(feature = "analytics")] -pub type SimilarAggregator = segment_analytics::SimilarAggregator; -#[cfg(feature = "analytics")] -pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator; -#[cfg(feature = "analytics")] -pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator; + +use crate::Opt; + +/// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name. +#[macro_export] +macro_rules! empty_analytics { + ($struct_name:ident, $event_name:literal) => { + #[derive(Default)] + struct $struct_name {} + + impl $crate::analytics::Aggregate for $struct_name { + fn event_name(&self) -> &'static str { + $event_name + } + + fn aggregate(self: Box, _other: Box) -> Box { + self + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::json!({}) + } + } + }; +} /// The Meilisearch config dir: /// `~/.config/Meilisearch` on *NIX or *BSD. @@ -78,60 +79,88 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } -pub trait Analytics: Sync + Send { - fn instance_uid(&self) -> Option<&InstanceUid>; +/// To send an event to segment, your event must be able to aggregate itself with another event of the same type. +pub trait Aggregate: 'static + mopa::Any + Send { + /// The name of the event that will be sent to segment. 
+ fn event_name(&self) -> &'static str; + + /// Will be called every time two events of the same type are received before segment flushes its buffer, so they can be merged. + fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> + where + Self: Sized; + + /// Converts your structure to the final event that'll be sent to segment. + fn into_event(self: Box<Self>) -> serde_json::Value; +} + +mopafy!(Aggregate); + +/// Helper trait to define multiple aggregates with the same content but a different name. +/// Commonly used when you must aggregate a search with POST or with GET, for example. +pub trait AggregateMethod: 'static + Default + Send { + fn event_name() -> &'static str; +} + +/// A macro used to quickly define multiple aggregate methods with their names +/// Usage: +/// ```rust +/// use meilisearch::aggregate_methods; +/// +/// aggregate_methods!( +/// SearchGET => "Documents Searched GET", +/// SearchPOST => "Documents Searched POST", +/// ); +/// ``` +#[macro_export] +macro_rules! aggregate_methods { + ($method:ident => $event_name:literal) => { + #[derive(Default)] + pub struct $method {} + + impl $crate::analytics::AggregateMethod for $method { + fn event_name() -> &'static str { + $event_name + } + } + }; + ($($method:ident => $event_name:literal,)+) => { + $( + aggregate_methods!($method => $event_name); + )+ + + }; +} + +#[derive(Clone)] +pub struct Analytics { + segment: Option<Arc<SegmentAnalytics>>, +} + +impl Analytics { + pub async fn new( + opt: &Opt, + index_scheduler: Arc<IndexScheduler>, + auth_controller: Arc<AuthController>, + ) -> Self { + if opt.no_analytics { + Self { segment: None } + } else { + Self { segment: SegmentAnalytics::new(opt, index_scheduler, auth_controller).await } + } + } + + pub fn no_analytics() -> Self { + Self { segment: None } + } + + pub fn instance_uid(&self) -> Option<&InstanceUid> { + self.segment.as_ref().map(|segment| segment.instance_uid.as_ref()) + } /// The method used to publish most analytics that do not need to be batched every hour - fn publish(&self, event_name: String, send: Value, request: Option<&HttpRequest>); - - /// This method should be called to aggregate a get search - fn get_search(&self, aggregate: SearchAggregator); - - /// This method should be called to aggregate a post search - fn post_search(&self, aggregate: SearchAggregator); - - /// This method should be called to aggregate a get similar request - fn get_similar(&self, aggregate: SimilarAggregator); - - /// This method should be called to aggregate a post similar request - fn post_similar(&self, aggregate: SimilarAggregator); - - /// This method should be called to aggregate a post array of searches - fn post_multi_search(&self, aggregate: MultiSearchAggregator); - - /// This method should be called to aggregate post facet values searches - fn post_facet_search(&self, aggregate: FacetSearchAggregator); - - // this method should be called to aggregate an add documents request - fn add_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ); - - // this method should be called to aggregate a fetch documents request - fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest); - - // this method should be called to aggregate a fetch documents request - fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest); - - // this method should be called to aggregate a add documents request - fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest); - - // this method should be called to batch an update documents request - fn 
update_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ); - - // this method should be called to batch an update documents by function request - fn update_documents_by_function( - &self, - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ); + pub fn publish<T: Aggregate>(&self, event: T, request: &HttpRequest) { + if let Some(ref segment) = self.segment { + let _ = segment.sender.try_send(segment_analytics::Message::new(event, request)); + } + } } diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index f8d6a0fdc..7dc746b14 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -1,19 +1,18 @@ -use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; +use std::any::TypeId; +use std::collections::{HashMap, HashSet}; use std::fs; -use std::mem::take; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{Duration, Instant}; -use actix_web::http::header::{CONTENT_TYPE, USER_AGENT}; +use actix_web::http::header::USER_AGENT; use actix_web::HttpRequest; use byte_unit::Byte; use index_scheduler::IndexScheduler; use meilisearch_auth::{AuthController, AuthFilter}; -use meilisearch_types::locales::Locale; +use meilisearch_types::features::RuntimeTogglableFeatures; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; -use regex::Regex; use segment::message::{Identify, Track, User}; use segment::{AutoBatcher, Batcher, HttpClient}; use serde::Serialize; @@ -24,22 +23,11 @@ use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use uuid::Uuid; -use super::{ - config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH, -}; -use crate::analytics::Analytics; +use super::{config_user_id_path, Aggregate, MEILISEARCH_CONFIG_PATH}; use crate::option::{ default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, }; -use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; -use crate::routes::indexes::facet_search::FacetSearchQuery; use crate::routes::{create_all_stats, Stats}; -use crate::search::{ - FacetSearchResult, FederatedSearch, MatchingStrategy, SearchQuery, SearchQueryWithIndex, - SearchResult, SimilarQuery, SimilarResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEMANTIC_RATIO, -}; use crate::Opt; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; @@ -57,7 +45,7 @@ fn write_user_id(db_path: &Path, user_id: &InstanceUid) { const SEGMENT_API_KEY: &str = "P3FWhhEsJiEDCuEHpmcN9DHcK4hVfBvb"; -pub fn extract_user_agents(request: &HttpRequest) -> Vec<String> { +pub fn extract_user_agents(request: &HttpRequest) -> HashSet<String> { request .headers() .get(ANALYTICS_HEADER) .or_else(|| request.headers().get(USER_AGENT)) .and_then(|header| header.to_str().ok()) .unwrap_or("unknown") .split(';') .map(str::trim) .map(ToString::to_string) .collect() } -pub enum AnalyticsMsg { - BatchMessage(Track), - AggregateGetSearch(SearchAggregator), - AggregatePostSearch(SearchAggregator), - AggregateGetSimilar(SimilarAggregator), - AggregatePostSimilar(SimilarAggregator), - AggregatePostMultiSearch(MultiSearchAggregator), - AggregatePostFacetSearch(FacetSearchAggregator), - AggregateAddDocuments(DocumentsAggregator), - AggregateDeleteDocuments(DocumentsDeletionAggregator), - AggregateUpdateDocuments(DocumentsAggregator), - 
AggregateEditDocumentsByFunction(EditDocumentsByFunctionAggregator), - AggregateGetFetchDocuments(DocumentsFetchAggregator), - AggregatePostFetchDocuments(DocumentsFetchAggregator), +pub struct Message { + // Since the type_id is resolved statically we cannot retrieve it from the Box. + // Thus we have to send it in the message directly. + type_id: TypeId, + // Same for the aggregate function. + #[allow(clippy::type_complexity)] + aggregator_function: fn(Box<dyn Aggregate>, Box<dyn Aggregate>) -> Option<Box<dyn Aggregate>>, + event: Event, +} + +pub struct Event { + original: Box<dyn Aggregate>, + timestamp: OffsetDateTime, + user_agents: HashSet<String>, + total: usize, +} + +/// This function should always be called on the same type. If `old` and `new` +/// aren't of the same type the function will do nothing and return `None`. +fn downcast_aggregate<ConcreteType: Aggregate>( + old: Box<dyn Aggregate>, + new: Box<dyn Aggregate>, +) -> Option<Box<dyn Aggregate>> { + if old.is::<ConcreteType>() && new.is::<ConcreteType>() { + // The two following lines cannot fail, but just to be sure we don't crash, we're still avoiding unwrapping + let this = old.downcast::<ConcreteType>().ok()?; + let other = new.downcast::<ConcreteType>().ok()?; + Some(ConcreteType::aggregate(this, other)) + } else { + None + } +} + +impl Message { + pub fn new<T: Aggregate>(event: T, request: &HttpRequest) -> Self { + Self { + type_id: TypeId::of::<T>(), + event: Event { + original: Box::new(event), + timestamp: OffsetDateTime::now_utc(), + user_agents: extract_user_agents(request), + total: 1, + }, + aggregator_function: downcast_aggregate::<T>, + } + } } pub struct SegmentAnalytics { - instance_uid: InstanceUid, - sender: Sender<AnalyticsMsg>, - user: User, + pub instance_uid: InstanceUid, + pub user: User, + pub sender: Sender<Message>, } impl SegmentAnalytics { @@ -98,7 +118,7 @@ impl SegmentAnalytics { opt: &Opt, index_scheduler: Arc<IndexScheduler>, auth_controller: Arc<AuthController>, - ) -> Arc<dyn Analytics> { + ) -> Option<Arc<Self>> { let instance_uid = super::find_user_id(&opt.db_path); let first_time_run = instance_uid.is_none(); let instance_uid = instance_uid.unwrap_or_else(Uuid::new_v4); @@ -108,7 +128,7 @@ impl SegmentAnalytics { // if reqwest throws an error we won't be able to send analytics if client.is_err() { - return super::MockAnalytics::new(opt); + return None; } let client = @@ -144,113 +164,13 @@ impl SegmentAnalytics { user: user.clone(), opt: opt.clone(), batcher, - post_search_aggregator: SearchAggregator::default(), - post_multi_search_aggregator: MultiSearchAggregator::default(), - post_facet_search_aggregator: FacetSearchAggregator::default(), - get_search_aggregator: SearchAggregator::default(), - add_documents_aggregator: DocumentsAggregator::default(), - delete_documents_aggregator: DocumentsDeletionAggregator::default(), - update_documents_aggregator: DocumentsAggregator::default(), - edit_documents_by_function_aggregator: EditDocumentsByFunctionAggregator::default(), - get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - get_similar_aggregator: SimilarAggregator::default(), - post_similar_aggregator: SimilarAggregator::default(), + events: HashMap::new(), }); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); let this = Self { instance_uid, sender, user: user.clone() }; - Arc::new(this) - } -} - -impl super::Analytics for SegmentAnalytics { - fn instance_uid(&self) -> Option<&InstanceUid> { - Some(&self.instance_uid) - } - - fn publish(&self, event_name: String, mut send: Value, request: Option<&HttpRequest>) { - let user_agent = request.map(extract_user_agents); - - send["user-agent"] = json!(user_agent); - let event = Track { - user: self.user.clone(), - 
event: event_name.clone(), - properties: send, - ..Default::default() - }; - let _ = self.sender.try_send(AnalyticsMsg::BatchMessage(event)); - } - - fn get_search(&self, aggregate: SearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetSearch(aggregate)); - } - - fn post_search(&self, aggregate: SearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSearch(aggregate)); - } - - fn get_similar(&self, aggregate: SimilarAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetSimilar(aggregate)); - } - - fn post_similar(&self, aggregate: SimilarAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSimilar(aggregate)); - } - - fn post_facet_search(&self, aggregate: FacetSearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFacetSearch(aggregate)); - } - - fn post_multi_search(&self, aggregate: MultiSearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostMultiSearch(aggregate)); - } - - fn add_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = DocumentsAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateAddDocuments(aggregate)); - } - - fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest) { - let aggregate = DocumentsDeletionAggregator::from_query(kind, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateDeleteDocuments(aggregate)); - } - - fn update_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = DocumentsAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateUpdateDocuments(aggregate)); - } - - fn update_documents_by_function( - &self, - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = - EditDocumentsByFunctionAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateEditDocumentsByFunction(aggregate)); - } - - fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) { - let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetFetchDocuments(aggregate)); - } - - fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) { - let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); + Some(Arc::new(this)) } } @@ -263,8 +183,12 @@ impl super::Analytics for SegmentAnalytics { struct Infos { env: String, experimental_contains_filter: bool, + experimental_vector_store: bool, experimental_enable_metrics: bool, + experimental_edit_documents_by_function: bool, experimental_search_queue_size: usize, + experimental_drop_search_after: usize, + experimental_nb_searches_per_core: usize, experimental_logs_mode: LogMode, experimental_replication_parameters: bool, experimental_enable_logs_route: bool, @@ -298,8 +222,8 @@ struct Infos { ssl_tickets: bool, } -impl From<Opt> for Infos { - fn from(options: Opt) -> Self { +impl Infos { + pub fn new(options: Opt, features: RuntimeTogglableFeatures) -> Self { // We want to decompose this whole struct by hand to be sure we don't forget // 
to add analytics when we add a field in the Opt. // Thus we must not insert `..` at the end. @@ -308,6 +232,8 @@ impl From<Opt> for Infos { experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after, + experimental_nb_searches_per_core, experimental_logs_mode, experimental_replication_parameters, experimental_enable_logs_route, @@ -340,8 +266,7 @@ impl From<Opt> for Infos { log_level, indexer_options, config_file_path, - #[cfg(feature = "analytics")] - no_analytics: _, + no_analytics: _, } = options; let schedule_snapshot = match schedule_snapshot { @@ -352,16 +277,28 @@ impl From<Opt> for Infos { let IndexerOpts { max_indexing_memory, max_indexing_threads, skip_index_budget: _ } = indexer_options; + let RuntimeTogglableFeatures { + vector_store, + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + } = features; + // We're going to override any sensitive information. // We consider information sensitive if it contains a path, an address, or a key. Self { env, - experimental_contains_filter, - experimental_enable_metrics, + experimental_contains_filter: experimental_contains_filter | contains_filter, + experimental_vector_store: vector_store, + experimental_edit_documents_by_function: edit_documents_by_function, + experimental_enable_metrics: experimental_enable_metrics | metrics, experimental_search_queue_size, + experimental_drop_search_after: experimental_drop_search_after.into(), + experimental_nb_searches_per_core: experimental_nb_searches_per_core.into(), experimental_logs_mode, experimental_replication_parameters, - experimental_enable_logs_route, + experimental_enable_logs_route: experimental_enable_logs_route | logs_route, experimental_reduce_indexing_memory_usage, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), @@ -395,26 +332,15 @@ impl From<Opt> for Infos { } pub struct Segment { - inbox: Receiver<AnalyticsMsg>, + inbox: Receiver<Message>, user: User, opt: Opt, batcher: AutoBatcher, - get_search_aggregator: SearchAggregator, - post_search_aggregator: SearchAggregator, - post_multi_search_aggregator: MultiSearchAggregator, - post_facet_search_aggregator: FacetSearchAggregator, - add_documents_aggregator: DocumentsAggregator, - delete_documents_aggregator: DocumentsDeletionAggregator, - update_documents_aggregator: DocumentsAggregator, - edit_documents_by_function_aggregator: EditDocumentsByFunctionAggregator, - get_fetch_documents_aggregator: DocumentsFetchAggregator, - post_fetch_documents_aggregator: DocumentsFetchAggregator, - get_similar_aggregator: SimilarAggregator, - post_similar_aggregator: SimilarAggregator, + events: HashMap<TypeId, Event>, } impl Segment { - fn compute_traits(opt: &Opt, stats: Stats) -> Value { + fn compute_traits(opt: &Opt, stats: Stats, features: RuntimeTogglableFeatures) -> Value { static FIRST_START_TIMESTAMP: Lazy<Instant> = Lazy::new(Instant::now); static SYSTEM: Lazy<Value> = Lazy::new(|| { let disks = Disks::new_with_refreshed_list(); @@ -442,7 +368,7 @@ impl Segment { "indexes_number": stats.indexes.len(), "documents_number": number_of_documents, }, - "infos": Infos::from(opt.clone()), + "infos": Infos::new(opt.clone(), features), }) } @@ -461,36 +387,44 @@ impl Segment { _ = interval.tick() => { self.tick(index_scheduler.clone(), auth_controller.clone()).await; }, - msg = self.inbox.recv() => { - match msg { - Some(AnalyticsMsg::BatchMessage(msg)) => drop(self.batcher.push(msg).await), - Some(AnalyticsMsg::AggregateGetSearch(agreg)) => 
self.get_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostSearch(agreg)) => self.post_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostMultiSearch(agreg)) => self.post_multi_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostFacetSearch(agreg)) => self.post_facet_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateAddDocuments(agreg)) => self.add_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateDeleteDocuments(agreg)) => self.delete_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateEditDocumentsByFunction(agreg)) => self.edit_documents_by_function_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateGetSimilar(agreg)) => self.get_similar_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostSimilar(agreg)) => self.post_similar_aggregator.aggregate(agreg), - None => (), - } - } + Some(msg) = self.inbox.recv() => { + self.handle_msg(msg); + } } } } + fn handle_msg(&mut self, Message { type_id, aggregator_function, event }: Message) { + let new_event = match self.events.remove(&type_id) { + Some(old) => { + // The function should never fail since we retrieved the corresponding TypeId in the map. But in the unfortunate + // case it happens, we're going to silently ignore the error + let Some(original) = (aggregator_function)(old.original, event.original) else { + return; + }; + Event { + original, + // We always want to return the FIRST timestamp ever encountered + timestamp: old.timestamp, + user_agents: old.user_agents.union(&event.user_agents).cloned().collect(), + total: old.total.saturating_add(event.total), + } + } + None => event, + }; + self.events.insert(type_id, new_event); + } + async fn tick( &mut self, index_scheduler: Arc<IndexScheduler>, auth_controller: Arc<AuthController>, ) { - if let Ok(stats) = - create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default()) - { + if let Ok(stats) = create_all_stats( + index_scheduler.clone().into(), + auth_controller.into(), + &AuthFilter::default(), + ) { // Replace the version number with the prototype name if any. 
let version = if let Some(prototype) = build_info::DescribeResult::from_build() .and_then(|describe| describe.as_prototype()) @@ -509,1486 +443,42 @@ impl Segment { }, })), user: self.user.clone(), - traits: Self::compute_traits(&self.opt, stats), + traits: Self::compute_traits( + &self.opt, + stats, + index_scheduler.features().runtime_features(), + ), ..Default::default() }) .await; } - let Segment { - inbox: _, - opt: _, - batcher: _, - user, - get_search_aggregator, - post_search_aggregator, - post_multi_search_aggregator, - post_facet_search_aggregator, - add_documents_aggregator, - delete_documents_aggregator, - update_documents_aggregator, - edit_documents_by_function_aggregator, - get_fetch_documents_aggregator, - post_fetch_documents_aggregator, - get_similar_aggregator, - post_similar_aggregator, - } = self; + // We empty the list of events + let events = std::mem::take(&mut self.events); - if let Some(get_search) = - take(get_search_aggregator).into_event(user, "Documents Searched GET") - { - let _ = self.batcher.push(get_search).await; - } - if let Some(post_search) = - take(post_search_aggregator).into_event(user, "Documents Searched POST") - { - let _ = self.batcher.push(post_search).await; - } - if let Some(post_multi_search) = take(post_multi_search_aggregator) - .into_event(user, "Documents Searched by Multi-Search POST") - { - let _ = self.batcher.push(post_multi_search).await; - } - if let Some(post_facet_search) = - take(post_facet_search_aggregator).into_event(user, "Facet Searched POST") - { - let _ = self.batcher.push(post_facet_search).await; - } - if let Some(add_documents) = - take(add_documents_aggregator).into_event(user, "Documents Added") - { - let _ = self.batcher.push(add_documents).await; - } - if let Some(delete_documents) = - take(delete_documents_aggregator).into_event(user, "Documents Deleted") - { - let _ = self.batcher.push(delete_documents).await; - } - if let Some(update_documents) = - take(update_documents_aggregator).into_event(user, "Documents Updated") - { - let _ = self.batcher.push(update_documents).await; - } - if let Some(edit_documents_by_function) = take(edit_documents_by_function_aggregator) - .into_event(user, "Documents Edited By Function") - { - let _ = self.batcher.push(edit_documents_by_function).await; - } - if let Some(get_fetch_documents) = - take(get_fetch_documents_aggregator).into_event(user, "Documents Fetched GET") - { - let _ = self.batcher.push(get_fetch_documents).await; - } - if let Some(post_fetch_documents) = - take(post_fetch_documents_aggregator).into_event(user, "Documents Fetched POST") - { - let _ = self.batcher.push(post_fetch_documents).await; + for (_, event) in events { + let Event { original, timestamp, user_agents, total } = event; + let name = original.event_name(); + let mut properties = original.into_event(); + if properties["user-agent"].is_null() { + properties["user-agent"] = json!(user_agents); + }; + if properties["requests"]["total_received"].is_null() { + properties["requests"]["total_received"] = total.into(); + }; + + let _ = self + .batcher + .push(Track { + user: self.user.clone(), + event: name.to_string(), + properties, + timestamp: Some(timestamp), + ..Default::default() + }) + .await; } - if let Some(get_similar_documents) = - take(get_similar_aggregator).into_event(user, "Similar GET") - { - let _ = self.batcher.push(get_similar_documents).await; - } - - if let Some(post_similar_documents) = - take(post_similar_aggregator).into_event(user, "Similar POST") - { - let _ = 
self.batcher.push(post_similar_documents).await; - } let _ = self.batcher.flush().await; } } - -#[derive(Default)] -pub struct SearchAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - total_degraded: usize, - total_used_negative_operator: usize, - time_spent: BinaryHeap, - - // sort - sort_with_geo_point: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - sort_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - sort_total_number_of_criteria: usize, - - // distinct - distinct: bool, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // attributes_to_search_on - // every time a search is done using attributes_to_search_on - attributes_to_search_on_total_number_of_uses: usize, - - // q - // The maximum number of terms in a q request - max_terms_number: usize, - - // vector - // The maximum number of floats in a vector request - max_vector_size: usize, - // Whether the semantic ratio passed to a hybrid search equals the default ratio. - semantic_ratio: bool, - hybrid: bool, - retrieve_vectors: bool, - - // every time a search is done, we increment the counter linked to the used settings - matching_strategy: HashMap, - - // List of the unique Locales passed as parameter - locales: BTreeSet, - - // pagination - max_limit: usize, - max_offset: usize, - finite_pagination: usize, - - // formatting - max_attributes_to_retrieve: usize, - max_attributes_to_highlight: usize, - highlight_pre_tag: bool, - highlight_post_tag: bool, - max_attributes_to_crop: usize, - crop_marker: bool, - show_matches_position: bool, - crop_length: bool, - - // facets - facets_sum_of_terms: usize, - facets_total_number_of_facets: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, -} - -impl SearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SearchQuery, request: &HttpRequest) -> Self { - let SearchQuery { - q, - vector, - offset, - limit, - page, - hits_per_page, - attributes_to_retrieve: _, - retrieve_vectors, - attributes_to_crop: _, - crop_length, - attributes_to_highlight: _, - show_matches_position, - show_ranking_score, - show_ranking_score_details, - filter, - sort, - distinct, - facets: _, - highlight_pre_tag, - highlight_post_tag, - crop_marker, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - - if let Some(ref sort) = sort { - ret.sort_total_number_of_criteria = 1; - ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); - ret.sort_sum_of_criteria_terms = sort.len(); - } - - ret.distinct = distinct.is_some(); - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => 
"string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - // attributes_to_search_on - if attributes_to_search_on.is_some() { - ret.attributes_to_search_on_total_number_of_uses = 1; - } - - if let Some(ref q) = q { - ret.max_terms_number = q.split_whitespace().count(); - } - - if let Some(ref vector) = vector { - ret.max_vector_size = vector.len(); - } - ret.retrieve_vectors |= retrieve_vectors; - - if query.is_finite_pagination() { - let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); - ret.max_limit = limit; - ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; - ret.finite_pagination = 1; - } else { - ret.max_limit = *limit; - ret.max_offset = *offset; - ret.finite_pagination = 0; - } - - ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); - - if let Some(locales) = locales { - ret.locales = locales.iter().copied().collect(); - } - - ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); - ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); - ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); - ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); - ret.show_matches_position = *show_matches_position; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - if let Some(hybrid) = hybrid { - ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); - ret.hybrid = true; - } - - ret - } - - pub fn succeed(&mut self, result: &SearchResult) { - let SearchResult { - hits: _, - query: _, - processing_time_ms, - hits_info: _, - semantic_hit_count: _, - facet_distribution: _, - facet_stats: _, - degraded, - used_negative_operator, - } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - if *degraded { - self.total_degraded = self.total_degraded.saturating_add(1); - } - if *used_negative_operator { - self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); - } - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [SearchAggregator] into another. 
- pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - ref mut locales, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.total_degraded = self.total_degraded.saturating_add(total_degraded); - self.total_used_negative_operator = - self.total_used_negative_operator.saturating_add(total_used_negative_operator); - self.time_spent.append(time_spent); - - // sort - self.sort_with_geo_point |= sort_with_geo_point; - self.sort_sum_of_criteria_terms = - self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); - self.sort_total_number_of_criteria = - self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); - - // distinct - self.distinct |= distinct; - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - // attributes_to_search_on - self.attributes_to_search_on_total_number_of_uses = self - .attributes_to_search_on_total_number_of_uses - .saturating_add(attributes_to_search_on_total_number_of_uses); - - // q - self.max_terms_number = self.max_terms_number.max(max_terms_number); - - // vector - self.max_vector_size = self.max_vector_size.max(max_vector_size); - self.retrieve_vectors |= retrieve_vectors; - self.semantic_ratio |= semantic_ratio; - self.hybrid |= hybrid; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - self.finite_pagination += finite_pagination; - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - self.max_attributes_to_highlight = - self.max_attributes_to_highlight.max(max_attributes_to_highlight); - self.highlight_pre_tag |= highlight_pre_tag; - self.highlight_post_tag |= highlight_post_tag; - self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); - self.crop_marker |= 
crop_marker; - self.show_matches_position |= show_matches_position; - self.crop_length |= crop_length; - - // facets - self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); - self.facets_total_number_of_facets = - self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); - - // matching strategy - for (key, value) in matching_strategy.into_iter() { - let matching_strategy = self.matching_strategy.entry(key).or_insert(0); - *matching_strategy = matching_strategy.saturating_add(value); - } - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - - // locales - self.locales.append(locales); - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - locales, - } = self; - - if total_received == 0 { - None - } else { - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - "total_degraded": total_degraded, - "total_used_negative_operator": total_used_negative_operator, - }, - "sort": { - "with_geoPoint": sort_with_geo_point, - "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), - }, - "distinct": distinct, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "attributes_to_search_on": { - "total_number_of_uses": attributes_to_search_on_total_number_of_uses, - }, - "q": { - "max_terms_number": max_terms_number, - }, - "vector": { - "max_vector_size": max_vector_size, - "retrieve_vectors": retrieve_vectors, - }, - "hybrid": { - "enabled": hybrid, - "semantic_ratio": semantic_ratio, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - 
"most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - "max_attributes_to_highlight": max_attributes_to_highlight, - "highlight_pre_tag": highlight_pre_tag, - "highlight_post_tag": highlight_post_tag, - "max_attributes_to_crop": max_attributes_to_crop, - "crop_marker": crop_marker, - "show_matches_position": show_matches_position, - "crop_length": crop_length, - }, - "facets": { - "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), - }, - "matching_strategy": { - "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "locales": locales, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct MultiSearchAggregator { - timestamp: Option, - - // requests - total_received: usize, - total_succeeded: usize, - - // sum of the number of distinct indexes in each single request, use with total_received to compute an avg - total_distinct_index_count: usize, - // number of queries with a single index, use with total_received to compute a proportion - total_single_index: usize, - - // sum of the number of search queries in the requests, use with total_received to compute an average - total_search_count: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - - // federation - use_federation: bool, - - // context - user_agents: HashSet, -} - -impl MultiSearchAggregator { - pub fn from_federated_search( - federated_search: &FederatedSearch, - request: &HttpRequest, - ) -> Self { - let timestamp = Some(OffsetDateTime::now_utc()); - - let user_agents = extract_user_agents(request).into_iter().collect(); - - let use_federation = federated_search.federation.is_some(); - - let distinct_indexes: HashSet<_> = federated_search - .queries - .iter() - .map(|query| { - let query = &query; - // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex - let SearchQueryWithIndex { - index_uid, - federation_options: _, - q: _, - vector: _, - offset: _, - limit: _, - page: _, - hits_per_page: _, - attributes_to_retrieve: _, - retrieve_vectors: _, - attributes_to_crop: _, - crop_length: _, - attributes_to_highlight: _, - show_ranking_score: _, - show_ranking_score_details: _, - show_matches_position: _, - filter: _, - sort: _, - distinct: _, - facets: _, - highlight_pre_tag: _, - highlight_post_tag: _, - crop_marker: _, - matching_strategy: _, - attributes_to_search_on: _, - hybrid: _, - ranking_score_threshold: _, - locales: _, - } = query; - - index_uid.as_str() - }) - .collect(); - - let show_ranking_score = - federated_search.queries.iter().any(|query| query.show_ranking_score); - let show_ranking_score_details = - federated_search.queries.iter().any(|query| query.show_ranking_score_details); - - Self { - timestamp, - total_received: 1, - total_succeeded: 0, - total_distinct_index_count: distinct_indexes.len(), - total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, - total_search_count: federated_search.queries.len(), - show_ranking_score, - show_ranking_score_details, - 
user_agents, - use_federation, - } - } - - pub fn succeed(&mut self) { - self.total_succeeded = self.total_succeeded.saturating_add(1); - } - - /// Aggregate one [MultiSearchAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - // write the aggregate in a way that will cause a compilation error if a field is added. - - // get ownership of self, replacing it by a default value. - let this = std::mem::take(self); - - let timestamp = this.timestamp.or(other.timestamp); - let total_received = this.total_received.saturating_add(other.total_received); - let total_succeeded = this.total_succeeded.saturating_add(other.total_succeeded); - let total_distinct_index_count = - this.total_distinct_index_count.saturating_add(other.total_distinct_index_count); - let total_single_index = this.total_single_index.saturating_add(other.total_single_index); - let total_search_count = this.total_search_count.saturating_add(other.total_search_count); - let show_ranking_score = this.show_ranking_score || other.show_ranking_score; - let show_ranking_score_details = - this.show_ranking_score_details || other.show_ranking_score_details; - let mut user_agents = this.user_agents; - let use_federation = this.use_federation || other.use_federation; - - for user_agent in other.user_agents.into_iter() { - user_agents.insert(user_agent); - } - - // need all fields or compile error - let mut aggregated = Self { - timestamp, - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - user_agents, - show_ranking_score, - show_ranking_score_details, - use_federation, - // do not add _ or ..Default::default() here - }; - - // replace the default self with the aggregated value - std::mem::swap(self, &mut aggregated); - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - user_agents, - show_ranking_score, - show_ranking_score_details, - use_federation, - } = self; - - if total_received == 0 { - None - } else { - let properties = json!({ - "user-agent": user_agents, - "requests": { - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "indexes": { - "total_single_index": total_single_index, - "total_distinct_index_count": total_distinct_index_count, - "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early - }, - "searches": { - "total_search_count": total_search_count, - "avg_search_count": (total_search_count as f64) / (total_received as f64), - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - }, - "federation": { - "use_federation": use_federation, - } - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct FacetSearchAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // The set of all facetNames that were used - facet_names: HashSet, - - // As there been any other parameter than the facetName or facetQuery ones? 
- additional_search_parameters_provided: bool, -} - -impl FacetSearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &FacetSearchQuery, request: &HttpRequest) -> Self { - let FacetSearchQuery { - facet_query: _, - facet_name, - vector, - q, - filter, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - ret.facet_names = Some(facet_name.clone()).into_iter().collect(); - - ret.additional_search_parameters_provided = q.is_some() - || vector.is_some() - || filter.is_some() - || *matching_strategy != MatchingStrategy::default() - || attributes_to_search_on.is_some() - || hybrid.is_some() - || ranking_score_threshold.is_some() - || locales.is_some(); - - ret - } - - pub fn succeed(&mut self, result: &FacetSearchResult) { - let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result; - self.total_succeeded = self.total_succeeded.saturating_add(1); - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [FacetSearchAggregator] into another. - pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - facet_names, - additional_search_parameters_provided, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(time_spent); - - // facet_names - for facet_name in facet_names.into_iter() { - self.facet_names.insert(facet_name); - } - - // additional_search_parameters_provided - self.additional_search_parameters_provided |= additional_search_parameters_provided; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - time_spent, - facet_names, - additional_search_parameters_provided, - } = self; - - if total_received == 0 { - None - } else { - // the index of the 99th percentage of value - let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) 
+ 1.; - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th as usize); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "facets": { - "total_distinct_facet_count": facet_names.len(), - "additional_search_parameters_provided": additional_search_parameters_provided, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct DocumentsAggregator { - timestamp: Option, - - // set to true when at least one request was received - updated: bool, - - // context - user_agents: HashSet, - - content_types: HashSet, - primary_keys: HashSet, - index_creation: bool, -} - -impl DocumentsAggregator { - pub fn from_query( - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) -> Self { - let UpdateDocumentsQuery { primary_key, csv_delimiter: _ } = documents_query; - - let mut primary_keys = HashSet::new(); - if let Some(primary_key) = primary_key.clone() { - primary_keys.insert(primary_key); - } - - let mut content_types = HashSet::new(); - let content_type = request - .headers() - .get(CONTENT_TYPE) - .and_then(|s| s.to_str().ok()) - .unwrap_or("unknown") - .to_string(); - content_types.insert(content_type); - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - updated: true, - user_agents: extract_user_agents(request).into_iter().collect(), - content_types, - primary_keys, - index_creation, - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
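Two different 99th-percentile estimates appear in this code: the facet-search path computes 0.99 * (total_succeeded - 1) + 1 while the similar-search path below uses len * 99 / 100; both then index into the sorted timings. A standalone nearest-rank sketch of the same idea (timings as usize milliseconds; function name hypothetical):

    fn response_time_99th(mut timings: Vec<usize>) -> Option<usize> {
        if timings.is_empty() {
            return None;
        }
        timings.sort_unstable();
        // 1-based nearest rank, then back to a 0-based index.
        let rank = ((timings.len() as f64) * 0.99).ceil() as usize;
        timings.get(rank.saturating_sub(1)).copied()
    }

    fn main() {
        let timings: Vec<usize> = (1..=200).collect();
        assert_eq!(response_time_99th(timings), Some(198));
    }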
- pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, primary_keys, content_types, index_creation, updated } = - other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - self.updated |= updated; - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - for primary_key in primary_keys { - self.primary_keys.insert(primary_key); - } - for content_type in content_types { - self.content_types.insert(content_type); - } - self.index_creation |= index_creation; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { timestamp, user_agents, primary_keys, content_types, index_creation, updated } = - self; - - if !updated { - None - } else { - let properties = json!({ - "user-agent": user_agents, - "payload_type": content_types, - "primary_key": primary_keys, - "index_creation": index_creation, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct EditDocumentsByFunctionAggregator { - timestamp: Option, - - // Set to true if at least one request was filtered - filtered: bool, - // Set to true if at least one request contained a context - with_context: bool, - - // context - user_agents: HashSet, - - index_creation: bool, -} - -impl EditDocumentsByFunctionAggregator { - pub fn from_query( - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ) -> Self { - let DocumentEditionByFunction { filter, context, function: _ } = documents_query; - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - filtered: filter.is_some(), - with_context: context.is_some(), - index_creation, - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
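DocumentsAggregator::from_query above records the request's Content-Type, falling back to the literal string "unknown" when the header is absent or not valid UTF-8 rather than dropping the event. The same extraction, isolated into a small runnable helper (helper name hypothetical):

    use actix_web::http::header::CONTENT_TYPE;
    use actix_web::HttpRequest;

    // Unreadable or missing Content-Type headers are recorded as "unknown".
    fn payload_type(req: &HttpRequest) -> String {
        req.headers()
            .get(CONTENT_TYPE)
            .and_then(|v| v.to_str().ok())
            .unwrap_or("unknown")
            .to_string()
    }

    fn main() {
        let req = actix_web::test::TestRequest::default()
            .insert_header((CONTENT_TYPE, "application/json"))
            .to_http_request();
        assert_eq!(payload_type(&req), "application/json");
    }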
- pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, index_creation, filtered, with_context } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.index_creation |= index_creation; - self.filtered |= filtered; - self.with_context |= with_context; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { timestamp, user_agents, index_creation, filtered, with_context } = self; - - let properties = json!({ - "user-agent": user_agents, - "filtered": filtered, - "with_context": with_context, - "index_creation": index_creation, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } -} - -#[derive(Default, Serialize)] -pub struct DocumentsDeletionAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, - per_document_id: bool, - clear_all: bool, - per_batch: bool, - per_filter: bool, -} - -impl DocumentsDeletionAggregator { - pub fn from_query(kind: DocumentDeletionKind, request: &HttpRequest) -> Self { - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - per_document_id: matches!(kind, DocumentDeletionKind::PerDocumentId), - clear_all: matches!(kind, DocumentDeletionKind::ClearAll), - per_batch: matches!(kind, DocumentDeletionKind::PerBatch), - per_filter: matches!(kind, DocumentDeletionKind::PerFilter), - } - } - - /// Aggregate one [DocumentsAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - per_document_id, - clear_all, - per_batch, - per_filter, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.total_received = self.total_received.saturating_add(total_received); - self.per_document_id |= per_document_id; - self.clear_all |= clear_all; - self.per_batch |= per_batch; - self.per_filter |= per_filter; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. 
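Unlike the aggregators that build their payload with json!(), DocumentsDeletionAggregator serializes itself directly, using serde renames to produce dotted keys such as "requests.total_received"; the dot is part of the literal key, not a nested object. A small sketch under that assumption (struct and values hypothetical):

    use std::collections::HashSet;

    use serde::Serialize;

    #[derive(Serialize)]
    struct DeletionEvent {
        #[serde(rename = "user-agent")]
        user_agents: HashSet<String>,
        #[serde(rename = "requests.total_received")]
        total_received: usize,
        per_filter: bool,
    }

    fn main() {
        let event = DeletionEvent {
            user_agents: HashSet::from(["curl/8".to_string()]),
            total_received: 1,
            per_filter: true,
        };
        // {"user-agent":["curl/8"],"requests.total_received":1,"per_filter":true}
        println!("{}", serde_json::to_string(&event).unwrap());
    }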
- let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - -#[derive(Default, Serialize)] -pub struct DocumentsFetchAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, - - // a call on ../documents/:doc_id - per_document_id: bool, - // if a filter was used - per_filter: bool, - - #[serde(rename = "vector.retrieve_vectors")] - retrieve_vectors: bool, - - // pagination - #[serde(rename = "pagination.max_limit")] - max_limit: usize, - #[serde(rename = "pagination.max_offset")] - max_offset: usize, -} - -impl DocumentsFetchAggregator { - pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset, retrieve_vectors) = match query { - DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), - DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { - (*limit, *offset, *retrieve_vectors) - } - }; - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), - per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), - max_limit: limit, - max_offset: offset, - retrieve_vectors, - } - } - - /// Aggregate one [DocumentsFetchAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - per_document_id, - per_filter, - max_limit, - max_offset, - retrieve_vectors, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - - self.total_received = self.total_received.saturating_add(total_received); - self.per_document_id |= per_document_id; - self.per_filter |= per_filter; - - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - self.retrieve_vectors |= retrieve_vectors; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. 
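The per_filter flag above is derived with a matches! guard over DocumentFetchKind. A compact, runnable illustration against the same enum shape (values hypothetical):

    #[allow(dead_code)]
    enum DocumentFetchKind {
        PerDocumentId { retrieve_vectors: bool },
        Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
    }

    fn main() {
        let query = DocumentFetchKind::Normal {
            with_filter: true,
            limit: 20,
            offset: 0,
            retrieve_vectors: false,
        };
        // The guard form: true only for Normal { with_filter: true, .. }.
        let per_filter =
            matches!(&query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter);
        let per_document_id = matches!(&query, DocumentFetchKind::PerDocumentId { .. });
        assert!(per_filter && !per_document_id);
    }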
- let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - -#[derive(Default)] -pub struct SimilarAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // Whether a non-default embedder was specified - retrieve_vectors: bool, - - // pagination - max_limit: usize, - max_offset: usize, - - // formatting - max_attributes_to_retrieve: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, -} - -impl SimilarAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SimilarQuery, request: &HttpRequest) -> Self { - let SimilarQuery { - id: _, - embedder: _, - offset, - limit, - attributes_to_retrieve: _, - retrieve_vectors, - show_ranking_score, - show_ranking_score_details, - filter, - ranking_score_threshold, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - ret.max_limit = *limit; - ret.max_offset = *offset; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - ret.retrieve_vectors = *retrieve_vectors; - - ret - } - - pub fn succeed(&mut self, result: &SimilarResult) { - let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [SimilarAggregator] into another. 
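SimilarAggregator classifies the filter the same way the search aggregator does: a lazily-built "AND | OR" regex decides whether a JSON filter is a plain string, a plain array, or an array that mixes in string syntax. Extracted as a standalone function (behavior copied from the code above; requires the regex, once_cell, and serde_json crates):

    use once_cell::sync::Lazy;
    use regex::Regex;
    use serde_json::{json, Value};

    static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());

    fn filter_syntax(filter: &Value) -> &'static str {
        match filter {
            Value::String(_) => "string",
            Value::Array(values) => {
                // An array entry embedding AND/OR syntax makes the whole filter "mixed".
                if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
                    "mixed"
                } else {
                    "array"
                }
            }
            _ => "none",
        }
    }

    fn main() {
        assert_eq!(filter_syntax(&json!("genre = horror AND year > 2000")), "string");
        assert_eq!(filter_syntax(&json!(["genre = horror", "year > 2000"])), "array");
        assert_eq!(filter_syntax(&json!([["genre = horror AND year > 2000"]])), "mixed");
    }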
- pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(time_spent); - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - self.retrieve_vectors |= retrieve_vectors; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - } = self; - - if total_received == 0 { - None - } else { - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "vector": { - "retrieve_vectors": retrieve_vectors, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - }, - "formatting": { - 
"max_attributes_to_retrieve": max_attributes_to_retrieve, - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index b24f18fae..633ad2776 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -120,7 +120,7 @@ pub fn create_app( search_queue: Data, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Data, enable_dashboard: bool, ) -> actix_web::App< impl ServiceFactory< @@ -473,14 +473,14 @@ pub fn configure_data( search_queue: Data, opt: &Opt, (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Data, ) { let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize; config .app_data(index_scheduler) .app_data(auth) .app_data(search_queue) - .app_data(web::Data::from(analytics)) + .app_data(analytics) .app_data(web::Data::new(logs_route)) .app_data(web::Data::new(logs_stderr)) .app_data(web::Data::new(opt.clone())) diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index b66bfc5b8..c0652bf1e 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::thread::available_parallelism; +use std::time::Duration; use actix_web::http::KeepAlive; use actix_web::web::Data; @@ -123,19 +124,12 @@ async fn try_main() -> anyhow::Result<()> { let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?; - #[cfg(all(not(debug_assertions), feature = "analytics"))] - let analytics = if !opt.no_analytics { - analytics::SegmentAnalytics::new(&opt, index_scheduler.clone(), auth_controller.clone()) - .await - } else { - analytics::MockAnalytics::new(&opt) - }; - #[cfg(any(debug_assertions, not(feature = "analytics")))] - let analytics = analytics::MockAnalytics::new(&opt); + let analytics = + analytics::Analytics::new(&opt, index_scheduler.clone(), auth_controller.clone()).await; print_launch_resume(&opt, analytics.clone(), config_read_from); - run_http(index_scheduler, auth_controller, opt, log_handle, analytics).await?; + run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; Ok(()) } @@ -145,16 +139,23 @@ async fn run_http( auth_controller: Arc, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Arc, ) -> anyhow::Result<()> { let enable_dashboard = &opt.env == "development"; let opt_clone = opt.clone(); let index_scheduler = Data::from(index_scheduler); let auth_controller = Data::from(auth_controller); + let analytics = Data::from(analytics); let search_queue = SearchQueue::new( opt.experimental_search_queue_size, - available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()), - ); + available_parallelism() + .unwrap_or(NonZeroUsize::new(2).unwrap()) + .checked_mul(opt.experimental_nb_searches_per_core) + .unwrap_or(NonZeroUsize::MAX), + ) + .with_time_to_abort(Duration::from_secs( + usize::from(opt.experimental_drop_search_after) as u64 + )); let search_queue = Data::new(search_queue); let http_server = HttpServer::new(move || { @@ -180,11 +181,7 @@ async fn run_http( Ok(()) } -pub fn print_launch_resume( - opt: &Opt, - analytics: Arc, - 
config_read_from: Option<PathBuf>, -) { +pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Option<PathBuf>) { let build_info = build_info::BuildInfo::from_build(); let protocol = @@ -226,7 +223,6 @@ pub fn print_launch_resume( eprintln!("Prototype:\t\t{:?}", prototype); } - #[cfg(all(not(debug_assertions), feature = "analytics"))] { if !opt.no_analytics { eprintln!( diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 3799bdcb7..7e87a5a2c 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -2,7 +2,7 @@ use std::env::VarError; use std::ffi::OsStr; use std::fmt::Display; use std::io::{BufReader, Read}; -use std::num::ParseIntError; +use std::num::{NonZeroUsize, ParseIntError}; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; @@ -29,7 +29,6 @@ const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER"; -#[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH"; @@ -55,6 +54,8 @@ const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LO const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE"; +const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER"; +const MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE: &str = "MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE"; const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = @@ -208,7 +209,6 @@ pub struct Opt { /// Meilisearch automatically collects data from all instances that do not opt out using this flag. /// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted /// at any time. - #[cfg(feature = "analytics")] #[serde(default)] // we can't send true #[clap(long, env = MEILI_NO_ANALYTICS)] pub no_analytics: bool, @@ -357,10 +357,26 @@ pub struct Opt { /// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the /// queue is full it starts returning HTTP 503, Service Unavailable. /// The default value is 1000. - #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = 1000)] - #[serde(default)] + #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())] + #[serde(default = "default_experimental_search_queue_size")] pub experimental_search_queue_size: usize, + /// Experimental drop search after. For more information, see: + /// + /// Lets you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it. + /// The default value is 60. + #[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())] + #[serde(default = "default_drop_search_after")] + pub experimental_drop_search_after: NonZeroUsize, + + /// Experimental number of searches per core.
For more information, see: + /// + /// Lets you customize how many search requests can run on each core concurrently. + /// The default value is 4. + #[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())] + #[serde(default = "default_nb_searches_per_core")] + pub experimental_nb_searches_per_core: NonZeroUsize, + /// Experimental logs mode feature. For more information, see: /// /// Change the mode of the logs on the console. @@ -407,7 +423,6 @@ pub struct Opt { impl Opt { /// Whether analytics should be enabled or not. - #[cfg(all(not(debug_assertions), feature = "analytics"))] pub fn analytics(&self) -> bool { !self.no_analytics } @@ -487,11 +502,12 @@ impl Opt { ignore_missing_dump: _, ignore_dump_if_db_exists: _, config_file_path: _, - #[cfg(feature = "analytics")] no_analytics, experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after, + experimental_nb_searches_per_core, experimental_logs_mode, experimental_enable_logs_route, experimental_replication_parameters, @@ -513,10 +529,7 @@ impl Opt { ); } - #[cfg(feature = "analytics")] - { - export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); - } + export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); export_to_env_if_not_present( MEILI_HTTP_PAYLOAD_SIZE_LIMIT, http_payload_size_limit.to_string(), @@ -559,6 +572,14 @@ impl Opt { MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, experimental_search_queue_size.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, + experimental_drop_search_after.to_string(), + ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, + experimental_nb_searches_per_core.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_LOGS_MODE, experimental_logs_mode.to_string(), @@ -890,6 +911,18 @@ fn default_dump_dir() -> PathBuf { PathBuf::from(DEFAULT_DUMP_DIR) } +fn default_experimental_search_queue_size() -> usize { + 1000 +} + +fn default_drop_search_after() -> NonZeroUsize { + NonZeroUsize::new(60).unwrap() +} + +fn default_nb_searches_per_core() -> NonZeroUsize { + NonZeroUsize::new(4).unwrap() +} + /// Indicates if a snapshot was scheduled, and if yes with which interval. 
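Together with the run_http change above, these defaults bound concurrency at cores × experimental_nb_searches_per_core, saturating to NonZeroUsize::MAX if the multiplication overflows. A runnable sketch of that capacity rule (function name hypothetical):

    use std::num::NonZeroUsize;
    use std::thread::available_parallelism;

    fn concurrent_search_limit(nb_searches_per_core: NonZeroUsize) -> NonZeroUsize {
        available_parallelism()
            // Same fallback as run_http: assume two cores if detection fails.
            .unwrap_or(NonZeroUsize::new(2).unwrap())
            .checked_mul(nb_searches_per_core)
            .unwrap_or(NonZeroUsize::MAX)
    }

    fn main() {
        let limit = concurrent_search_limit(NonZeroUsize::new(4).unwrap());
        println!("up to {limit} concurrent searches");
    }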
#[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)] pub enum ScheduleSnapshot { diff --git a/crates/meilisearch/src/routes/dump.rs b/crates/meilisearch/src/routes/dump.rs index 7f3cd06a5..c78dc4dad 100644 --- a/crates/meilisearch/src/routes/dump.rs +++ b/crates/meilisearch/src/routes/dump.rs @@ -4,7 +4,6 @@ use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; use crate::analytics::Analytics; @@ -18,14 +17,16 @@ pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); } +crate::empty_analytics!(DumpAnalytics, "Dump Created"); + pub async fn create_dump( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { - analytics.publish("Dump Created".to_string(), json!({}), Some(&req)); + analytics.publish(DumpAnalytics::default(), &req); let task = KindWithContent::DumpCreation { keys: auth_controller.list_keys()?, diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index bc656bdbb..5d93adc02 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -6,10 +6,10 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; -use serde_json::json; +use serde::Serialize; use tracing::debug; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; @@ -17,7 +17,7 @@ use crate::extractors::sequential_extractor::SeqHandler; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") - .route(web::get().to(SeqHandler(get_features))) + .route(web::get().to(get_features)) .route(web::patch().to(SeqHandler(patch_features))), ); } @@ -27,12 +27,9 @@ async fn get_features( ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>, Data, >, - req: HttpRequest, - analytics: Data, ) -> HttpResponse { let features = index_scheduler.features(); - analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req)); let features = features.runtime_features(); debug!(returns = ?features, "Get features"); HttpResponse::Ok().json(features) @@ -53,6 +50,35 @@ pub struct RuntimeTogglableFeatures { pub contains_filter: Option, } +#[derive(Serialize)] +pub struct PatchExperimentalFeatureAnalytics { + vector_store: bool, + metrics: bool, + logs_route: bool, + edit_documents_by_function: bool, + contains_filter: bool, +} + +impl Aggregate for PatchExperimentalFeatureAnalytics { + fn event_name(&self) -> &'static str { + "Experimental features Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + vector_store: new.vector_store, + metrics: new.metrics, + logs_route: new.logs_route, + edit_documents_by_function: new.edit_documents_by_function, + contains_filter: new.contains_filter, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + async fn patch_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>, @@ -60,7 +86,7 @@ async 
fn patch_features( >, new_features: AwebJson, req: HttpRequest, - analytics: Data, + analytics: Data, ) -> Result { let features = index_scheduler.features(); debug!(parameters = ?new_features, "Patch features"); @@ -89,15 +115,14 @@ async fn patch_features( } = new_features; analytics.publish( - "Experimental features Updated".to_string(), - json!({ - "vector_store": vector_store, - "metrics": metrics, - "logs_route": logs_route, - "edit_documents_by_function": edit_documents_by_function, - "contains_filter": contains_filter, - }), - Some(&req), + PatchExperimentalFeatureAnalytics { + vector_store, + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + }, + &req, ); index_scheduler.put_runtime_features(new_features)?; debug!(returns = ?new_features, "Patch features"); diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 85cf33c54..47f73ef42 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use std::io::ErrorKind; +use std::marker::PhantomData; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -23,14 +25,14 @@ use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use serde_json::Value; use tempfile::tempfile; use tokio::fs::File; use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter}; use tracing::debug; -use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind}; +use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::error::MeilisearchHttpError; use crate::error::PayloadError::ReceivePayload; use crate::extractors::authentication::policies::*; @@ -41,7 +43,7 @@ use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; use crate::search::{parse_filter, RetrieveVectors}; -use crate::Opt; +use crate::{aggregate_methods, Opt}; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()] @@ -100,12 +102,84 @@ pub struct GetDocument { retrieve_vectors: Param, } +aggregate_methods!( + DocumentsGET => "Documents Fetched GET", + DocumentsPOST => "Documents Fetched POST", +); + +#[derive(Serialize)] +pub struct DocumentsFetchAggregator { + // a call on ../documents/:doc_id + per_document_id: bool, + // if a filter was used + per_filter: bool, + + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: bool, + + // pagination + #[serde(rename = "pagination.max_limit")] + max_limit: usize, + #[serde(rename = "pagination.max_offset")] + max_offset: usize, + + marker: std::marker::PhantomData, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum DocumentFetchKind { + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, +} + +impl DocumentsFetchAggregator { + pub fn from_query(query: &DocumentFetchKind) -> Self { + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { + (*limit, *offset, *retrieve_vectors) + } + }; + + Self { + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. 
}), + per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), + max_limit: limit, + max_offset: offset, + retrieve_vectors, + + marker: PhantomData, + } + } +} + +impl Aggregate for DocumentsFetchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + per_document_id: self.per_document_id | new.per_document_id, + per_filter: self.per_filter | new.per_filter, + retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, + max_limit: self.max_limit.max(new.max_limit), + max_offset: self.max_offset.max(new.max_offset), + marker: PhantomData, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn get_document( index_scheduler: GuardedData, Data>, document_param: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = document_param.into_inner(); debug!(parameters = ?params, "Get document"); @@ -117,8 +191,15 @@ pub async fn get_document( let features = index_scheduler.features(); let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; - analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, + analytics.publish( + DocumentsFetchAggregator:: { + retrieve_vectors: param_retrieve_vectors.0, + per_document_id: true, + per_filter: false, + max_limit: 0, + max_offset: 0, + marker: PhantomData, + }, &req, ); @@ -129,17 +210,52 @@ pub async fn get_document( Ok(HttpResponse::Ok().json(document)) } +#[derive(Serialize)] +pub struct DocumentsDeletionAggregator { + per_document_id: bool, + clear_all: bool, + per_batch: bool, + per_filter: bool, +} + +impl Aggregate for DocumentsDeletionAggregator { + fn event_name(&self) -> &'static str { + "Documents Deleted" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + per_document_id: self.per_document_id | new.per_document_id, + clear_all: self.clear_all | new.clear_all, + per_batch: self.per_batch | new.per_batch, + per_filter: self.per_filter | new.per_filter, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn delete_document( index_scheduler: GuardedData, Data>, path: web::Path, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = path.into_inner(); let index_uid = IndexUid::try_from(index_uid)?; - analytics.delete_documents(DocumentDeletionKind::PerDocumentId, &req); + analytics.publish( + DocumentsDeletionAggregator { + per_document_id: true, + clear_all: false, + per_batch: false, + per_filter: false, + }, + &req, + ); let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), @@ -190,17 +306,19 @@ pub async fn documents_by_query_post( index_uid: web::Path, body: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let body = body.into_inner(); debug!(parameters = ?body, "Get documents POST"); - analytics.post_fetch_documents( - &DocumentFetchKind::Normal { - with_filter: body.filter.is_some(), - limit: body.limit, - offset: body.offset, + analytics.publish( + DocumentsFetchAggregator:: { + per_filter: body.filter.is_some(), retrieve_vectors: body.retrieve_vectors, + max_limit: body.limit, + max_offset: 
body.offset, + per_document_id: false, + marker: PhantomData, }, &req, ); @@ -213,7 +331,7 @@ pub async fn get_documents( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Get documents GET"); @@ -235,12 +353,14 @@ pub async fn get_documents( filter, }; - analytics.get_fetch_documents( - &DocumentFetchKind::Normal { - with_filter: query.filter.is_some(), - limit: query.limit, - offset: query.offset, + analytics.publish( + DocumentsFetchAggregator:: { + per_filter: query.filter.is_some(), retrieve_vectors: query.retrieve_vectors, + max_limit: query.limit, + max_offset: query.offset, + per_document_id: false, + marker: PhantomData, }, &req, ); @@ -298,6 +418,39 @@ fn from_char_csv_delimiter( } } +aggregate_methods!( + Replaced => "Documents Added", + Updated => "Documents Updated", +); + +#[derive(Serialize)] +pub struct DocumentsAggregator { + payload_types: HashSet, + primary_key: HashSet, + index_creation: bool, + #[serde(skip)] + method: PhantomData, +} + +impl Aggregate for DocumentsAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + payload_types: self.payload_types.union(&new.payload_types).cloned().collect(), + primary_key: self.primary_key.union(&new.primary_key).cloned().collect(), + index_creation: self.index_creation | new.index_creation, + method: PhantomData, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(self).unwrap_or_default() + } +} + pub async fn replace_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -305,16 +458,32 @@ pub async fn replace_documents( body: Payload, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; debug!(parameters = ?params, "Replace documents"); let params = params.into_inner(); - analytics.add_documents( - ¶ms, - index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + let mut content_types = HashSet::new(); + let content_type = req + .headers() + .get(CONTENT_TYPE) + .and_then(|s| s.to_str().ok()) + .unwrap_or("unknown") + .to_string(); + content_types.insert(content_type); + let mut primary_keys = HashSet::new(); + if let Some(primary_key) = params.primary_key.clone() { + primary_keys.insert(primary_key); + } + analytics.publish( + DocumentsAggregator:: { + payload_types: content_types, + primary_key: primary_keys, + index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + method: PhantomData, + }, &req, ); @@ -346,16 +515,32 @@ pub async fn update_documents( body: Payload, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let params = params.into_inner(); debug!(parameters = ?params, "Update documents"); - analytics.add_documents( - ¶ms, - index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + let mut content_types = HashSet::new(); + let content_type = req + .headers() + .get(CONTENT_TYPE) + .and_then(|s| s.to_str().ok()) + .unwrap_or("unknown") + .to_string(); + content_types.insert(content_type); + let mut primary_keys = HashSet::new(); + if let Some(primary_key) = params.primary_key.clone() { + primary_keys.insert(primary_key); + } + analytics.publish( + DocumentsAggregator:: { + payload_types: content_types, + primary_key: 
primary_keys, + index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + method: PhantomData, + }, &req, ); @@ -524,12 +709,20 @@ pub async fn delete_documents_batch( body: web::Json>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by batch"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; - analytics.delete_documents(DocumentDeletionKind::PerBatch, &req); + analytics.publish( + DocumentsDeletionAggregator { + per_batch: true, + per_document_id: false, + clear_all: false, + per_filter: false, + }, + &req, + ); let ids = body .iter() @@ -562,14 +755,22 @@ pub async fn delete_documents_by_filter( body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by filter"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = index_uid.into_inner(); let filter = body.into_inner().filter; - analytics.delete_documents(DocumentDeletionKind::PerFilter, &req); + analytics.publish( + DocumentsDeletionAggregator { + per_filter: true, + per_document_id: false, + clear_all: false, + per_batch: false, + }, + &req, + ); // we ensure the filter is well formed before enqueuing it crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())? @@ -599,13 +800,41 @@ pub struct DocumentEditionByFunction { pub function: String, } +#[derive(Serialize)] +struct EditDocumentsByFunctionAggregator { + // Set to true if at least one request was filtered + filtered: bool, + // Set to true if at least one request contained a context + with_context: bool, + + index_creation: bool, +} + +impl Aggregate for EditDocumentsByFunctionAggregator { + fn event_name(&self) -> &'static str { + "Documents Edited By Function" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + filtered: self.filtered | new.filtered, + with_context: self.with_context | new.with_context, + index_creation: self.index_creation | new.index_creation, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn edit_documents_by_function( index_scheduler: GuardedData, Data>, index_uid: web::Path, params: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Edit documents by function"); @@ -617,9 +846,12 @@ pub async fn edit_documents_by_function( let index_uid = index_uid.into_inner(); let params = params.into_inner(); - analytics.update_documents_by_function( - ¶ms, - index_scheduler.index(&index_uid).is_err(), + analytics.publish( + EditDocumentsByFunctionAggregator { + filtered: params.filter.is_some(), + with_context: params.context.is_some(), + index_creation: index_scheduler.index(&index_uid).is_err(), + }, &req, ); @@ -670,10 +902,18 @@ pub async fn clear_all_documents( index_uid: web::Path, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); + analytics.publish( + DocumentsDeletionAggregator { + clear_all: true, + per_document_id: false, + per_batch: false, + per_filter: false, + }, + &req, + ); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; let uid = get_task_id(&req, &opt)?; diff --git 
a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 1df80711d..99a4a4f28 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -1,3 +1,5 @@ +use std::collections::{BinaryHeap, HashSet}; + use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -10,14 +12,15 @@ use meilisearch_types::locales::Locale; use serde_json::Value; use tracing::debug; -use crate::analytics::{Analytics, FacetSearchAggregator}; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, - SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy, + RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEARCH_OFFSET, }; use crate::search_queue::SearchQueue; @@ -53,20 +56,122 @@ pub struct FacetSearchQuery { pub locales: Option<Vec<Locale>>, } +#[derive(Default)] +pub struct FacetSearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap<usize>, + + // The set of all facetNames that were used + facet_names: HashSet<String>, + + // Has there been any other parameter than the facetName or facetQuery ones? + additional_search_parameters_provided: bool, +} + +impl FacetSearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &FacetSearchQuery) -> Self { + let FacetSearchQuery { + facet_query: _, + facet_name, + vector, + q, + filter, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + Self { + total_received: 1, + facet_names: Some(facet_name.clone()).into_iter().collect(), + additional_search_parameters_provided: q.is_some() + || vector.is_some() + || filter.is_some() + || *matching_strategy != MatchingStrategy::default() + || attributes_to_search_on.is_some() + || hybrid.is_some() + || ranking_score_threshold.is_some() + || locales.is_some(), + ..Default::default() + } + } + + pub fn succeed(&mut self, result: &FacetSearchResult) { + let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result; + self.total_succeeded = 1; + self.time_spent.push(*processing_time_ms as usize); + } +}
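The Aggregate trait these blocks implement is not itself shown in this excerpt; the shape below is reconstructed from the impl blocks in this diff, and the real definition in crate::analytics may carry extra bounds (for example Send + Sync):

    use serde_json::Value;

    pub trait Aggregate: 'static {
        // Static name of the analytics event this aggregate feeds.
        fn event_name(&self) -> &'static str;
        // Merge a newer aggregate of the same event into this one.
        fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self>;
        // Render the accumulated data as the event payload.
        fn into_event(self: Box<Self>) -> Value;
    }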
+ +impl Aggregate for FacetSearchAggregator { + fn event_name(&self) -> &'static str { + "Facet Searched POST" + } + + fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> { + for time in new.time_spent { + self.time_spent.push(time); + } + + Box::new(Self { + total_received: self.total_received.saturating_add(new.total_received), + total_succeeded: self.total_succeeded.saturating_add(new.total_succeeded), + time_spent: self.time_spent, + facet_names: self.facet_names.union(&new.facet_names).cloned().collect(), + additional_search_parameters_provided: self.additional_search_parameters_provided + | new.additional_search_parameters_provided, + }) + } + + fn into_event(self: Box<Self>) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + facet_names, + additional_search_parameters_provided, + } = *self; + // the index of the 99th percentile value + let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.; + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // We are only interested in the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th as usize); + + serde_json::json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic + "total_received": total_received, + }, + "facets": { + "total_distinct_facet_count": facet_names.len(), + "additional_search_parameters_provided": additional_search_parameters_provided, + }, + }) + } +} + pub async fn search( index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>, search_queue: Data<SearchQueue>, index_uid: web::Path<String>, params: AwebJson<FacetSearchQuery, DeserrJsonError>, req: HttpRequest, - analytics: web::Data<dyn Analytics>, + analytics: web::Data<Analytics>, ) -> Result<HttpResponse, ResponseError> { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = params.into_inner(); debug!(parameters = ?query, "Facet search"); - let mut aggregate = FacetSearchAggregator::from_query(&query, &req); + let mut aggregate = FacetSearchAggregator::from_query(&query); let facet_query = query.facet_query.clone(); let facet_name = query.facet_name.clone(); @@ -100,7 +205,7 @@ pub async fn search( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.post_facet_search(aggregate); + analytics.publish(aggregate, &req); let search_result = search_result?; diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 35b747ccf..7d073ec5f 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeSet; use std::convert::Infallible; use actix_web::web::Data; @@ -13,12 +14,11 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::{self, FieldDistribution, Index}; use meilisearch_types::tasks::KindWithContent; use serde::Serialize; -use serde_json::json; use time::OffsetDateTime; use tracing::debug; use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; @@ -28,8 +28,11 @@ use crate::Opt; pub mod documents; pub mod facet_search; pub mod search; +mod search_analytics; pub mod settings; +mod settings_analytics; pub mod similar; +mod similar_analytics; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( @@ -123,12 +126,31 @@ pub struct IndexCreateRequest { primary_key: Option<String>, } +#[derive(Serialize)] +struct IndexCreatedAggregate { + primary_key: BTreeSet<String>, +} + +impl Aggregate for IndexCreatedAggregate { + fn event_name(&self) -> &'static str { + "Index Created" + } + + fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> { + Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() }) + } + + fn into_event(self: Box<Self>) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + pub async fn create_index( index_scheduler: GuardedData, Data>, body: AwebJson, req: HttpRequest, opt: web::Data, + 
analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Create index"); let IndexCreateRequest { primary_key, uid } = body.into_inner(); @@ -136,9 +158,8 @@ pub async fn create_index( let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid); if allow_index_creation { analytics.publish( - "Index Created".to_string(), - json!({ "primary_key": primary_key }), - Some(&req), + IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() }, + &req, ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; @@ -194,21 +215,38 @@ pub async fn get_index( Ok(HttpResponse::Ok().json(index_view)) } +#[derive(Serialize)] +struct IndexUpdatedAggregate { + primary_key: BTreeSet, +} + +impl Aggregate for IndexUpdatedAggregate { + fn event_name(&self) -> &'static str { + "Index Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} pub async fn update_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Update index"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; let body = body.into_inner(); analytics.publish( - "Index Updated".to_string(), - json!({ "primary_key": body.primary_key }), - Some(&req), + IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() }, + &req, ); let task = KindWithContent::IndexUpdate { diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 6a8eee521..2f5cb4a36 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -13,12 +13,13 @@ use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; -use crate::analytics::{Analytics, SearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; +use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST}; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, @@ -225,7 +226,7 @@ pub async fn search_with_url_query( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Search get"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -237,7 +238,7 @@ pub async fn search_with_url_query( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query); let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); @@ -254,7 +255,7 @@ pub async fn search_with_url_query( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.get_search(aggregate); + analytics.publish(aggregate, &req); let search_result = search_result?; @@ -268,7 
+269,7 @@ pub async fn search_with_post( index_uid: web::Path<String>, params: AwebJson<SearchQuery, DeserrJsonError>, req: HttpRequest, - analytics: web::Data<dyn Analytics>, + analytics: web::Data<Analytics>, ) -> Result<HttpResponse, ResponseError> { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -280,7 +281,7 @@ pub async fn search_with_post( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::from_query(&query, &req); + let mut aggregate = SearchAggregator::<SearchPOST>::from_query(&query); let index = index_scheduler.index(&index_uid)?; @@ -302,7 +303,7 @@ pub async fn search_with_post( MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); } } - analytics.post_search(aggregate); + analytics.publish(aggregate, &req); let search_result = search_result?; diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs new file mode 100644 index 000000000..b16e2636e --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -0,0 +1,483 @@ +use std::collections::{BTreeSet, BinaryHeap, HashMap}; + +use meilisearch_types::locales::Locale; +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; + +use crate::aggregate_methods; +use crate::analytics::{Aggregate, AggregateMethod}; +use crate::search::{ + SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEMANTIC_RATIO, +}; + +aggregate_methods!( + SearchGET => "Documents Searched GET", + SearchPOST => "Documents Searched POST", +); + +#[derive(Default)] +pub struct SearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + total_degraded: usize, + total_used_negative_operator: usize, + time_spent: BinaryHeap<usize>, + + // sort + sort_with_geo_point: bool, + // every time a request has a sort, this field must be incremented by the number of terms it contains + sort_sum_of_criteria_terms: usize, + // every time a request has a sort, this field must be incremented by one + sort_total_number_of_criteria: usize, + + // distinct + distinct: bool, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap<String, usize>, + + // attributes_to_search_on + // every time a search is done using attributes_to_search_on + attributes_to_search_on_total_number_of_uses: usize, + + // q + // The maximum number of terms in a q request + max_terms_number: usize, + + // vector + // The maximum number of floats in a vector request + max_vector_size: usize, + // Whether the semantic ratio passed to a hybrid search differs from the default ratio.
+ semantic_ratio: bool, + hybrid: bool, + retrieve_vectors: bool, + + // every time a search is done, we increment the counter linked to the used settings + matching_strategy: HashMap, + + // List of the unique Locales passed as parameter + locales: BTreeSet, + + // pagination + max_limit: usize, + max_offset: usize, + finite_pagination: usize, + + // formatting + max_attributes_to_retrieve: usize, + max_attributes_to_highlight: usize, + highlight_pre_tag: bool, + highlight_post_tag: bool, + max_attributes_to_crop: usize, + crop_marker: bool, + show_matches_position: bool, + crop_length: bool, + + // facets + facets_sum_of_terms: usize, + facets_total_number_of_facets: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SearchQuery) -> Self { + let SearchQuery { + q, + vector, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve: _, + retrieve_vectors, + attributes_to_crop: _, + crop_length, + attributes_to_highlight: _, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + distinct, + facets: _, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref sort) = sort { + ret.sort_total_number_of_criteria = 1; + ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); + ret.sort_sum_of_criteria_terms = sort.len(); + } + + ret.distinct = distinct.is_some(); + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + // attributes_to_search_on + if attributes_to_search_on.is_some() { + ret.attributes_to_search_on_total_number_of_uses = 1; + } + + if let Some(ref q) = q { + ret.max_terms_number = q.split_whitespace().count(); + } + + if let Some(ref vector) = vector { + ret.max_vector_size = vector.len(); + } + ret.retrieve_vectors |= retrieve_vectors; + + if query.is_finite_pagination() { + let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); + ret.max_limit = limit; + ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; + ret.finite_pagination = 1; + } else { + ret.max_limit = *limit; + ret.max_offset = *offset; + ret.finite_pagination = 0; + } + + ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); + + if let Some(locales) = locales { + ret.locales = locales.iter().copied().collect(); + } + + ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); + ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); + ret.crop_marker = 
*crop_marker != DEFAULT_CROP_MARKER(); + ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); + ret.show_matches_position = *show_matches_position; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + if let Some(hybrid) = hybrid { + ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); + ret.hybrid = true; + } + + ret + } + + pub fn succeed(&mut self, result: &SearchResult) { + let SearchResult { + hits: _, + query: _, + processing_time_ms, + hits_info: _, + semantic_hit_count: _, + facet_distribution: _, + facet_stats: _, + degraded, + used_negative_operator, + } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + if *degraded { + self.total_degraded = self.total_degraded.saturating_add(1); + } + if *used_negative_operator { + self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); + } + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SearchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + mut locales, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.total_degraded = self.total_degraded.saturating_add(total_degraded); + self.total_used_negative_operator = + self.total_used_negative_operator.saturating_add(total_used_negative_operator); + self.time_spent.append(&mut time_spent); + + // sort + self.sort_with_geo_point |= sort_with_geo_point; + self.sort_sum_of_criteria_terms = + self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); + self.sort_total_number_of_criteria = + self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); + + // distinct + self.distinct |= distinct; + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + // attributes_to_search_on + self.attributes_to_search_on_total_number_of_uses = self + 
.attributes_to_search_on_total_number_of_uses + .saturating_add(attributes_to_search_on_total_number_of_uses); + + // q + self.max_terms_number = self.max_terms_number.max(max_terms_number); + + // vector + self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; + self.semantic_ratio |= semantic_ratio; + self.hybrid |= hybrid; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + self.finite_pagination += finite_pagination; + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + self.max_attributes_to_highlight = + self.max_attributes_to_highlight.max(max_attributes_to_highlight); + self.highlight_pre_tag |= highlight_pre_tag; + self.highlight_post_tag |= highlight_post_tag; + self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); + self.crop_marker |= crop_marker; + self.show_matches_position |= show_matches_position; + self.crop_length |= crop_length; + + // facets + self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); + self.facets_total_number_of_facets = + self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); + + // matching strategy + for (key, value) in matching_strategy.into_iter() { + let matching_strategy = self.matching_strategy.entry(key).or_insert(0); + *matching_strategy = matching_strategy.saturating_add(value); + } + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + // locales + self.locales.append(&mut locales); + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + locales, + marker: _, + } = *self; + + // we get all the values in sorted order + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentile value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested in the slowest value among the 99% fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic + "total_received": total_received, + "total_degraded": total_degraded, + "total_used_negative_operator": total_used_negative_operator, + }, + "sort": { + "with_geoPoint": sort_with_geo_point, + "avg_criteria_number": format!("{:.2}",
sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), + }, + "distinct": distinct, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "attributes_to_search_on": { + "total_number_of_uses": attributes_to_search_on_total_number_of_uses, + }, + "q": { + "max_terms_number": max_terms_number, + }, + "vector": { + "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, + }, + "hybrid": { + "enabled": hybrid, + "semantic_ratio": semantic_ratio, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + "max_attributes_to_highlight": max_attributes_to_highlight, + "highlight_pre_tag": highlight_pre_tag, + "highlight_post_tag": highlight_post_tag, + "max_attributes_to_crop": max_attributes_to_crop, + "crop_marker": crop_marker, + "show_matches_position": show_matches_position, + "crop_length": crop_length, + }, + "facets": { + "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), + }, + "matching_strategy": { + "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "locales": locales, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + }, + }) + } +} diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index aaf8673d0..a9d8d3053 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -4,14 +4,13 @@ use deserr::actix_web::AwebJson; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; -use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::Setting; -use meilisearch_types::settings::{settings, RankingRuleView, SecretPolicy, Settings, Unchecked}; +use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; +use super::settings_analytics::*; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; @@ -20,7 +19,7 @@ use crate::Opt; #[macro_export] macro_rules! make_setting_route { - ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics_var:ident, $analytics:expr) => { + ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { pub mod $attr { use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse, Resource}; @@ -80,7 +79,7 @@ macro_rules! 
make_setting_route { body: deserr::actix_web::AwebJson, $err_ty>, req: HttpRequest, opt: web::Data, - $analytics_var: web::Data, + analytics: web::Data, ) -> std::result::Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -88,7 +87,10 @@ macro_rules! make_setting_route { debug!(parameters = ?body, "Update settings"); #[allow(clippy::redundant_closure_call)] - $analytics(&body, &req); + analytics.publish( + $crate::routes::indexes::settings_analytics::$analytics::new(body.as_ref()).into_settings(), + &req, + ); let new_settings = Settings { $attr: match body { @@ -160,21 +162,7 @@ make_setting_route!( >, filterable_attributes, "filterableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "FilterableAttributes Updated".to_string(), - json!({ - "filterable_attributes": { - "total": setting.as_ref().map(|filter| filter.len()).unwrap_or(0), - "has_geo": setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false), - } - }), - Some(req), - ); - } + FilterableAttributesAnalytics ); make_setting_route!( @@ -186,21 +174,7 @@ make_setting_route!( >, sortable_attributes, "sortableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "SortableAttributes Updated".to_string(), - json!({ - "sortable_attributes": { - "total": setting.as_ref().map(|sort| sort.len()), - "has_geo": setting.as_ref().map(|sort| sort.contains("_geo")), - }, - }), - Some(req), - ); - } + SortableAttributesAnalytics ); make_setting_route!( @@ -212,21 +186,7 @@ make_setting_route!( >, displayed_attributes, "displayedAttributes", - analytics, - |displayed: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "DisplayedAttributes Updated".to_string(), - json!({ - "displayed_attributes": { - "total": displayed.as_ref().map(|displayed| displayed.len()), - "with_wildcard": displayed.as_ref().map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - }, - }), - Some(req), - ); - } + DisplayedAttributesAnalytics ); make_setting_route!( @@ -238,40 +198,7 @@ make_setting_route!( >, typo_tolerance, "typoTolerance", - analytics, - |setting: &Option, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "TypoTolerance Updated".to_string(), - json!({ - "typo_tolerance": { - "enabled": setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), - "disable_on_attributes": setting - .as_ref() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - "disable_on_words": setting - .as_ref() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - "min_word_size_for_one_typo": setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.one_typo.set())) - .flatten(), - "min_word_size_for_two_typos": setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.two_typos.set())) - .flatten(), - }, - }), - Some(req), - ); - } + TypoToleranceAnalytics ); make_setting_route!( @@ -283,21 +210,7 @@ make_setting_route!( >, searchable_attributes, "searchableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "SearchableAttributes Updated".to_string(), - json!({ - "searchable_attributes": { - "total": setting.as_ref().map(|searchable| searchable.len()), - "with_wildcard": setting.as_ref().map(|searchable| searchable.iter().any(|searchable| searchable == 
"*")), - }, - }), - Some(req), - ); - } + SearchableAttributesAnalytics ); make_setting_route!( @@ -309,20 +222,7 @@ make_setting_route!( >, stop_words, "stopWords", - analytics, - |stop_words: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "StopWords Updated".to_string(), - json!({ - "stop_words": { - "total": stop_words.as_ref().map(|stop_words| stop_words.len()), - }, - }), - Some(req), - ); - } + StopWordsAnalytics ); make_setting_route!( @@ -334,20 +234,7 @@ make_setting_route!( >, non_separator_tokens, "nonSeparatorTokens", - analytics, - |non_separator_tokens: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "nonSeparatorTokens Updated".to_string(), - json!({ - "non_separator_tokens": { - "total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()), - }, - }), - Some(req), - ); - } + NonSeparatorTokensAnalytics ); make_setting_route!( @@ -359,20 +246,7 @@ make_setting_route!( >, separator_tokens, "separatorTokens", - analytics, - |separator_tokens: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "separatorTokens Updated".to_string(), - json!({ - "separator_tokens": { - "total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()), - }, - }), - Some(req), - ); - } + SeparatorTokensAnalytics ); make_setting_route!( @@ -384,20 +258,7 @@ make_setting_route!( >, dictionary, "dictionary", - analytics, - |dictionary: &Option>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "dictionary Updated".to_string(), - json!({ - "dictionary": { - "total": dictionary.as_ref().map(|dictionary| dictionary.len()), - }, - }), - Some(req), - ); - } + DictionaryAnalytics ); make_setting_route!( @@ -409,20 +270,7 @@ make_setting_route!( >, synonyms, "synonyms", - analytics, - |synonyms: &Option>>, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "Synonyms Updated".to_string(), - json!({ - "synonyms": { - "total": synonyms.as_ref().map(|synonyms| synonyms.len()), - }, - }), - Some(req), - ); - } + SynonymsAnalytics ); make_setting_route!( @@ -434,19 +282,7 @@ make_setting_route!( >, distinct_attribute, "distinctAttribute", - analytics, - |distinct: &Option, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "DistinctAttribute Updated".to_string(), - json!({ - "distinct_attribute": { - "set": distinct.is_some(), - } - }), - Some(req), - ); - } + DistinctAttributeAnalytics ); make_setting_route!( @@ -458,20 +294,7 @@ make_setting_route!( >, proximity_precision, "proximityPrecision", - analytics, - |precision: &Option, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "ProximityPrecision Updated".to_string(), - json!({ - "proximity_precision": { - "set": precision.is_some(), - "value": precision.unwrap_or_default(), - } - }), - Some(req), - ); - } + ProximityPrecisionAnalytics ); make_setting_route!( @@ -483,17 +306,7 @@ make_setting_route!( >, localized_attributes, "localizedAttributes", - analytics, - |rules: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "LocalizedAttributesRules Updated".to_string(), - json!({ - "locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::>()) - }), - Some(req), - ); - } + LocalesAnalytics ); make_setting_route!( @@ -505,26 +318,7 @@ make_setting_route!( >, ranking_rules, "rankingRules", - analytics, - |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - - 
analytics.publish( - "RankingRules Updated".to_string(), - json!({ - "ranking_rules": { - "words_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))), - "typo_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))), - "proximity_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Proximity))), - "attribute_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Attribute))), - "sort_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))), - "exactness_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Exactness))), - "values": setting.as_ref().map(|rr| rr.iter().filter(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Asc(_) | meilisearch_types::settings::RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::>().join(", ")), - } - }), - Some(req), - ); - } + RankingRulesAnalytics ); make_setting_route!( @@ -536,25 +330,7 @@ make_setting_route!( >, faceting, "faceting", - analytics, - |setting: &Option, req: &HttpRequest| { - use serde_json::json; - use meilisearch_types::facet_values_sort::FacetValuesSort; - - analytics.publish( - "Faceting Updated".to_string(), - json!({ - "faceting": { - "max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()), - "sort_facet_values_by_star_count": setting.as_ref().and_then(|s| { - s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - "sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - }, - }), - Some(req), - ); - } + FacetingAnalytics ); make_setting_route!( @@ -566,20 +342,7 @@ make_setting_route!( >, pagination, "pagination", - analytics, - |setting: &Option, req: &HttpRequest| { - use serde_json::json; - - analytics.publish( - "Pagination Updated".to_string(), - json!({ - "pagination": { - "max_total_hits": setting.as_ref().and_then(|s| s.max_total_hits.set()), - }, - }), - Some(req), - ); - } + PaginationAnalytics ); make_setting_route!( @@ -591,75 +354,9 @@ make_setting_route!( >, embedders, "embedders", - analytics, - |setting: &Option>>, req: &HttpRequest| { - - - analytics.publish( - "Embedders Updated".to_string(), - serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}), - Some(req), - ); - } + EmbeddersAnalytics ); -fn embedder_analytics( - setting: Option< - &std::collections::BTreeMap< - String, - Setting, - >, - >, -) -> serde_json::Value { - let mut sources = std::collections::HashSet::new(); - - if let Some(s) = &setting { - for source in s - .values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.source.set()) - { - use meilisearch_types::milli::vector::settings::EmbedderSource; - match source { - EmbedderSource::OpenAi => sources.insert("openAi"), - EmbedderSource::HuggingFace => sources.insert("huggingFace"), - EmbedderSource::UserProvided => sources.insert("userProvided"), - EmbedderSource::Ollama => sources.insert("ollama"), - EmbedderSource::Rest => sources.insert("rest"), - }; - } - }; - - let document_template_used = setting.as_ref().map(|map| { - map.values() - 
.filter_map(|config| config.clone().set()) - .any(|config| config.document_template.set().is_some()) - }); - - let document_template_max_bytes = setting.as_ref().and_then(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.document_template_max_bytes.set()) - .max() - }); - - let binary_quantization_used = setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.binary_quantized.set().is_some()) - }); - - json!( - { - "total": setting.as_ref().map(|s| s.len()), - "sources": sources, - "document_template_used": document_template_used, - "document_template_max_bytes": document_template_max_bytes, - "binary_quantization_used": binary_quantization_used, - } - ) -} - make_setting_route!( "/search-cutoff-ms", put, @@ -669,14 +366,7 @@ make_setting_route!( >, search_cutoff_ms, "searchCutoffMs", - analytics, - |setting: &Option, req: &HttpRequest| { - analytics.publish( - "Search Cutoff Updated".to_string(), - serde_json::json!({"search_cutoff_ms": setting }), - Some(req), - ); - } + SearchCutoffMsAnalytics ); macro_rules! generate_configure { @@ -720,7 +410,7 @@ pub async fn update_all( body: AwebJson, DeserrJsonError>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -729,104 +419,45 @@ pub async fn update_all( let new_settings = validate_settings(new_settings, &index_scheduler)?; analytics.publish( - "Settings Updated".to_string(), - json!({ - "ranking_rules": { - "words_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Words))), - "typo_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Typo))), - "proximity_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Proximity))), - "attribute_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Attribute))), - "sort_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Sort))), - "exactness_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Exactness))), - "values": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().filter(|s| !matches!(s, RankingRuleView::Asc(_) | RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::>().join(", ")), - }, - "searchable_attributes": { - "total": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.len()), - "with_wildcard": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.iter().any(|searchable| searchable == "*")), - }, - "displayed_attributes": { - "total": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.len()), - "with_wildcard": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - }, - "sortable_attributes": { - "total": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.len()), - "has_geo": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.iter().any(|s| s == "_geo")), - }, - "filterable_attributes": { - "total": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.len()), - "has_geo": 
new_settings.filterable_attributes.as_ref().set().map(|filter| filter.iter().any(|s| s == "_geo")), - }, - "distinct_attribute": { - "set": new_settings.distinct_attribute.as_ref().set().is_some() - }, - "proximity_precision": { - "set": new_settings.proximity_precision.as_ref().set().is_some(), - "value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default() - }, - "typo_tolerance": { - "enabled": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.enabled.as_ref().set()) - .copied(), - "disable_on_attributes": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - "disable_on_words": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - "min_word_size_for_one_typo": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.one_typo.set())) - .flatten(), - "min_word_size_for_two_typos": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.two_typos.set())) - .flatten(), - }, - "faceting": { - "max_values_per_facet": new_settings.faceting - .as_ref() - .set() - .and_then(|s| s.max_values_per_facet.as_ref().set()), - "sort_facet_values_by_star_count": new_settings.faceting - .as_ref() - .set() - .and_then(|s| { - s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - "sort_facet_values_by_total": new_settings.faceting - .as_ref() - .set() - .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - }, - "pagination": { - "max_total_hits": new_settings.pagination - .as_ref() - .set() - .and_then(|s| s.max_total_hits.as_ref().set()), - }, - "stop_words": { - "total": new_settings.stop_words.as_ref().set().map(|stop_words| stop_words.len()), - }, - "synonyms": { - "total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()), - }, - "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()), - "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(), - "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::>()), - }), - Some(&req), + SettingsAnalytics { + ranking_rules: RankingRulesAnalytics::new(new_settings.ranking_rules.as_ref().set()), + searchable_attributes: SearchableAttributesAnalytics::new( + new_settings.searchable_attributes.as_ref().set(), + ), + displayed_attributes: DisplayedAttributesAnalytics::new( + new_settings.displayed_attributes.as_ref().set(), + ), + sortable_attributes: SortableAttributesAnalytics::new( + new_settings.sortable_attributes.as_ref().set(), + ), + filterable_attributes: FilterableAttributesAnalytics::new( + new_settings.filterable_attributes.as_ref().set(), + ), + distinct_attribute: DistinctAttributeAnalytics::new( + new_settings.distinct_attribute.as_ref().set(), + ), + proximity_precision: ProximityPrecisionAnalytics::new( + new_settings.proximity_precision.as_ref().set(), + ), + typo_tolerance: TypoToleranceAnalytics::new(new_settings.typo_tolerance.as_ref().set()), + faceting: FacetingAnalytics::new(new_settings.faceting.as_ref().set()), + pagination: PaginationAnalytics::new(new_settings.pagination.as_ref().set()), + stop_words: StopWordsAnalytics::new(new_settings.stop_words.as_ref().set()), + synonyms: 
SynonymsAnalytics::new(new_settings.synonyms.as_ref().set()), + embedders: EmbeddersAnalytics::new(new_settings.embedders.as_ref().set()), + search_cutoff_ms: SearchCutoffMsAnalytics::new( + new_settings.search_cutoff_ms.as_ref().set(), + ), + locales: LocalesAnalytics::new(new_settings.localized_attributes.as_ref().set()), + dictionary: DictionaryAnalytics::new(new_settings.dictionary.as_ref().set()), + separator_tokens: SeparatorTokensAnalytics::new( + new_settings.separator_tokens.as_ref().set(), + ), + non_separator_tokens: NonSeparatorTokensAnalytics::new( + new_settings.non_separator_tokens.as_ref().set(), + ), + }, + &req, ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs new file mode 100644 index 000000000..32bddcbdd --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -0,0 +1,622 @@ +//! All the structures used to make the analytics on the settings work. +//! The signatures of the `new` functions are not very Rust-idiomatic because they must match the types received +//! through the sub-settings route directly without any manipulation. +//! This is why we often use an `Option<&Vec<_>>` instead of an `Option<&[_]>`. + +use std::collections::{BTreeMap, BTreeSet, HashSet}; + +use meilisearch_types::facet_values_sort::FacetValuesSort; +use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; +use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::settings::{ + FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings, +}; +use serde::Serialize; + +use crate::analytics::Aggregate; + +#[derive(Serialize, Default)] +pub struct SettingsAnalytics { + pub ranking_rules: RankingRulesAnalytics, + pub searchable_attributes: SearchableAttributesAnalytics, + pub displayed_attributes: DisplayedAttributesAnalytics, + pub sortable_attributes: SortableAttributesAnalytics, + pub filterable_attributes: FilterableAttributesAnalytics, + pub distinct_attribute: DistinctAttributeAnalytics, + pub proximity_precision: ProximityPrecisionAnalytics, + pub typo_tolerance: TypoToleranceAnalytics, + pub faceting: FacetingAnalytics, + pub pagination: PaginationAnalytics, + pub stop_words: StopWordsAnalytics, + pub synonyms: SynonymsAnalytics, + pub embedders: EmbeddersAnalytics, + pub search_cutoff_ms: SearchCutoffMsAnalytics, + pub locales: LocalesAnalytics, + pub dictionary: DictionaryAnalytics, + pub separator_tokens: SeparatorTokensAnalytics, + pub non_separator_tokens: NonSeparatorTokensAnalytics, +} + +impl Aggregate for SettingsAnalytics { + fn event_name(&self) -> &'static str { + "Settings Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { + ranking_rules: RankingRulesAnalytics { + words_position: new + .ranking_rules + .words_position + .or(self.ranking_rules.words_position), + typo_position: new.ranking_rules.typo_position.or(self.ranking_rules.typo_position), + proximity_position: new + .ranking_rules + .proximity_position + .or(self.ranking_rules.proximity_position), + attribute_position: new + .ranking_rules + .attribute_position + .or(self.ranking_rules.attribute_position), + sort_position: new.ranking_rules.sort_position.or(self.ranking_rules.sort_position), + exactness_position: new + .ranking_rules + .exactness_position
.or(self.ranking_rules.exactness_position), + values: new.ranking_rules.values.or(self.ranking_rules.values), + }, + searchable_attributes: SearchableAttributesAnalytics { + total: new.searchable_attributes.total.or(self.searchable_attributes.total), + with_wildcard: new + .searchable_attributes + .with_wildcard + .or(self.searchable_attributes.with_wildcard), + }, + displayed_attributes: DisplayedAttributesAnalytics { + total: new.displayed_attributes.total.or(self.displayed_attributes.total), + with_wildcard: new + .displayed_attributes + .with_wildcard + .or(self.displayed_attributes.with_wildcard), + }, + sortable_attributes: SortableAttributesAnalytics { + total: new.sortable_attributes.total.or(self.sortable_attributes.total), + has_geo: new.sortable_attributes.has_geo.or(self.sortable_attributes.has_geo), + }, + filterable_attributes: FilterableAttributesAnalytics { + total: new.filterable_attributes.total.or(self.filterable_attributes.total), + has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo), + }, + distinct_attribute: DistinctAttributeAnalytics { + set: self.distinct_attribute.set | new.distinct_attribute.set, + }, + proximity_precision: ProximityPrecisionAnalytics { + set: self.proximity_precision.set | new.proximity_precision.set, + value: new.proximity_precision.value.or(self.proximity_precision.value), + }, + typo_tolerance: TypoToleranceAnalytics { + enabled: new.typo_tolerance.enabled.or(self.typo_tolerance.enabled), + disable_on_attributes: new + .typo_tolerance + .disable_on_attributes + .or(self.typo_tolerance.disable_on_attributes), + disable_on_words: new + .typo_tolerance + .disable_on_words + .or(self.typo_tolerance.disable_on_words), + min_word_size_for_one_typo: new + .typo_tolerance + .min_word_size_for_one_typo + .or(self.typo_tolerance.min_word_size_for_one_typo), + min_word_size_for_two_typos: new + .typo_tolerance + .min_word_size_for_two_typos + .or(self.typo_tolerance.min_word_size_for_two_typos), + }, + faceting: FacetingAnalytics { + max_values_per_facet: new + .faceting + .max_values_per_facet + .or(self.faceting.max_values_per_facet), + sort_facet_values_by_star_count: new + .faceting + .sort_facet_values_by_star_count + .or(self.faceting.sort_facet_values_by_star_count), + sort_facet_values_by_total: new + .faceting + .sort_facet_values_by_total + .or(self.faceting.sort_facet_values_by_total), + }, + pagination: PaginationAnalytics { + max_total_hits: new.pagination.max_total_hits.or(self.pagination.max_total_hits), + }, + stop_words: StopWordsAnalytics { + total: new.stop_words.total.or(self.stop_words.total), + }, + synonyms: SynonymsAnalytics { total: new.synonyms.total.or(self.synonyms.total) }, + embedders: EmbeddersAnalytics { + total: new.embedders.total.or(self.embedders.total), + sources: match (self.embedders.sources, new.embedders.sources) { + (None, None) => None, + (Some(sources), None) | (None, Some(sources)) => Some(sources), + (Some(this), Some(other)) => Some(this.union(&other).cloned().collect()), + }, + document_template_used: match ( + self.embedders.document_template_used, + new.embedders.document_template_used, + ) { + (None, None) => None, + (Some(used), None) | (None, Some(used)) => Some(used), + (Some(this), Some(other)) => Some(this | other), + }, + document_template_max_bytes: match ( + self.embedders.document_template_max_bytes, + new.embedders.document_template_max_bytes, + ) { + (None, None) => None, + (Some(bytes), None) | (None, Some(bytes)) => Some(bytes), + (Some(this), Some(other)) => 
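+ // both sides are set: keep the larger document template size observed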
Some(this.max(other)), + }, + binary_quantization_used: match ( + self.embedders.binary_quantization_used, + new.embedders.binary_quantization_used, + ) { + (None, None) => None, + (Some(bq), None) | (None, Some(bq)) => Some(bq), + (Some(this), Some(other)) => Some(this | other), + }, + }, + search_cutoff_ms: SearchCutoffMsAnalytics { + search_cutoff_ms: new + .search_cutoff_ms + .search_cutoff_ms + .or(self.search_cutoff_ms.search_cutoff_ms), + }, + locales: LocalesAnalytics { locales: new.locales.locales.or(self.locales.locales) }, + dictionary: DictionaryAnalytics { + total: new.dictionary.total.or(self.dictionary.total), + }, + separator_tokens: SeparatorTokensAnalytics { + total: new.separator_tokens.total.or(self.separator_tokens.total), + }, + non_separator_tokens: NonSeparatorTokensAnalytics { + total: new.non_separator_tokens.total.or(self.non_separator_tokens.total), + }, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + +#[derive(Serialize, Default)] +pub struct RankingRulesAnalytics { + pub words_position: Option, + pub typo_position: Option, + pub proximity_position: Option, + pub attribute_position: Option, + pub sort_position: Option, + pub exactness_position: Option, + pub values: Option, +} + +impl RankingRulesAnalytics { + pub fn new(rr: Option<&Vec>) -> Self { + RankingRulesAnalytics { + words_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) + }), + typo_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) + }), + proximity_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) + }) + }), + attribute_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) + }) + }), + sort_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) + }), + exactness_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) + }) + }), + values: rr.as_ref().map(|rr| { + rr.iter() + .filter(|s| { + matches!( + s, + meilisearch_types::settings::RankingRuleView::Asc(_) + | meilisearch_types::settings::RankingRuleView::Desc(_) + ) + }) + .map(|x| x.to_string()) + .collect::>() + .join(", ") + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { ranking_rules: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SearchableAttributesAnalytics { + pub total: Option, + pub with_wildcard: Option, +} + +impl SearchableAttributesAnalytics { + pub fn new(setting: Option<&Vec>) -> Self { + Self { + total: setting.as_ref().map(|searchable| searchable.len()), + with_wildcard: setting + .as_ref() + .map(|searchable| searchable.iter().any(|searchable| searchable == "*")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { searchable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DisplayedAttributesAnalytics { + pub total: Option, + pub with_wildcard: Option, +} + +impl DisplayedAttributesAnalytics { + pub fn new(displayed: Option<&Vec>) -> Self { + Self { + total: displayed.as_ref().map(|displayed| displayed.len()), + with_wildcard:
displayed + .as_ref() + .map(|displayed| displayed.iter().any(|displayed| displayed == "*")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { displayed_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SortableAttributesAnalytics { + pub total: Option, + pub has_geo: Option, +} + +impl SortableAttributesAnalytics { + pub fn new(setting: Option<&BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|sort| sort.len()), + has_geo: setting.as_ref().map(|sort| sort.contains("_geo")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { sortable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct FilterableAttributesAnalytics { + pub total: Option, + pub has_geo: Option, +} + +impl FilterableAttributesAnalytics { + pub fn new(setting: Option<&BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|filter| filter.len()), + has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { filterable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DistinctAttributeAnalytics { + pub set: bool, +} + +impl DistinctAttributeAnalytics { + pub fn new(distinct: Option<&String>) -> Self { + Self { set: distinct.is_some() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { distinct_attribute: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct ProximityPrecisionAnalytics { + pub set: bool, + pub value: Option, +} + +impl ProximityPrecisionAnalytics { + pub fn new(precision: Option<&ProximityPrecisionView>) -> Self { + Self { set: precision.is_some(), value: precision.cloned() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { proximity_precision: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct TypoToleranceAnalytics { + pub enabled: Option, + pub disable_on_attributes: Option, + pub disable_on_words: Option, + pub min_word_size_for_one_typo: Option, + pub min_word_size_for_two_typos: Option, +} + +impl TypoToleranceAnalytics { + pub fn new(setting: Option<&TypoSettings>) -> Self { + Self { + enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), + disable_on_attributes: setting + .as_ref() + .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), + disable_on_words: setting + .as_ref() + .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), + min_word_size_for_one_typo: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set())) + .flatten(), + min_word_size_for_two_typos: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set())) + .flatten(), + } + } + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { typo_tolerance: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct FacetingAnalytics { + pub max_values_per_facet: Option, + pub sort_facet_values_by_star_count: Option, + pub sort_facet_values_by_total: Option, +} + +impl FacetingAnalytics { + pub fn new(setting: Option<&FacetingSettings>) -> Self { + Self { + max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()), + sort_facet_values_by_star_count: setting.as_ref().and_then(|s| { + s.sort_facet_values_by + .as_ref() + 
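+ // `set()` yields `Some` only when the setting holds an explicitly set value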
.set() + .map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) + }), + sort_facet_values_by_total: setting + .as_ref() + .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { faceting: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct PaginationAnalytics { + pub max_total_hits: Option, +} + +impl PaginationAnalytics { + pub fn new(setting: Option<&PaginationSettings>) -> Self { + Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { pagination: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct StopWordsAnalytics { + pub total: Option, +} + +impl StopWordsAnalytics { + pub fn new(stop_words: Option<&BTreeSet>) -> Self { + Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { stop_words: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SynonymsAnalytics { + pub total: Option, +} + +impl SynonymsAnalytics { + pub fn new(synonyms: Option<&BTreeMap>>) -> Self { + Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { synonyms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct EmbeddersAnalytics { + // last + pub total: Option, + // Merge the sources + pub sources: Option>, + // |= + pub document_template_used: Option, + // max + pub document_template_max_bytes: Option, + // |= + pub binary_quantization_used: Option, +} + +impl EmbeddersAnalytics { + pub fn new(setting: Option<&BTreeMap>>) -> Self { + let mut sources = std::collections::HashSet::new(); + + if let Some(s) = &setting { + for source in s + .values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.source.set()) + { + use meilisearch_types::milli::vector::settings::EmbedderSource; + match source { + EmbedderSource::OpenAi => sources.insert("openAi".to_string()), + EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()), + EmbedderSource::UserProvided => sources.insert("userProvided".to_string()), + EmbedderSource::Ollama => sources.insert("ollama".to_string()), + EmbedderSource::Rest => sources.insert("rest".to_string()), + }; + } + }; + + Self { + total: setting.as_ref().map(|s| s.len()), + sources: Some(sources), + document_template_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.document_template.set().is_some()) + }), + document_template_max_bytes: setting.as_ref().and_then(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.document_template_max_bytes.set()) + .max() + }), + binary_quantization_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.binary_quantized.set().is_some()) + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { embedders: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +pub struct SearchCutoffMsAnalytics { + pub search_cutoff_ms: Option, +} + +impl SearchCutoffMsAnalytics { + pub fn new(setting: Option<&u64>) -> Self { + Self { search_cutoff_ms: setting.copied() } + } + + pub fn 
into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { search_cutoff_ms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +pub struct LocalesAnalytics { + pub locales: Option>, +} + +impl LocalesAnalytics { + pub fn new(rules: Option<&Vec>) -> Self { + LocalesAnalytics { + locales: rules.as_ref().map(|rules| { + rules + .iter() + .flat_map(|rule| rule.locales.iter().cloned()) + .collect::>() + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { locales: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DictionaryAnalytics { + pub total: Option, +} + +impl DictionaryAnalytics { + pub fn new(dictionary: Option<&BTreeSet>) -> Self { + Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { dictionary: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SeparatorTokensAnalytics { + pub total: Option, +} + +impl SeparatorTokensAnalytics { + pub fn new(separator_tokens: Option<&BTreeSet>) -> Self { + Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { separator_tokens: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct NonSeparatorTokensAnalytics { + pub total: Option, +} + +impl NonSeparatorTokensAnalytics { + pub fn new(non_separator_tokens: Option<&BTreeSet>) -> Self { + Self { + total: non_separator_tokens + .as_ref() + .map(|non_separator_tokens| non_separator_tokens.len()), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { non_separator_tokens: self, ..Default::default() } + } +} diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index f94a02987..79f42f0aa 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -13,9 +13,10 @@ use serde_json::Value; use tracing::debug; use super::ActionPolicy; -use crate::analytics::{Analytics, SimilarAggregator}; +use crate::analytics::Analytics; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST}; use crate::search::{ add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, @@ -34,13 +35,13 @@ pub async fn similar_get( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = params.0.try_into()?; - let mut aggregate = SimilarAggregator::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query); debug!(parameters = ?query, "Similar get"); @@ -49,7 +50,7 @@ pub async fn similar_get( if let Ok(similar) = &similar { aggregate.succeed(similar); } - analytics.get_similar(aggregate); + analytics.publish(aggregate, &req); let similar = similar?; @@ -62,21 +63,21 @@ pub async fn similar_post( index_uid: web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = 
params.into_inner(); debug!(parameters = ?query, "Similar post"); - let mut aggregate = SimilarAggregator::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query); let similar = similar(index_scheduler, index_uid, query).await; if let Ok(similar) = &similar { aggregate.succeed(similar); } - analytics.post_similar(aggregate); + analytics.publish(aggregate, &req); let similar = similar?; diff --git a/crates/meilisearch/src/routes/indexes/similar_analytics.rs b/crates/meilisearch/src/routes/indexes/similar_analytics.rs new file mode 100644 index 000000000..726839c3a --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/similar_analytics.rs @@ -0,0 +1,233 @@ +use std::collections::{BinaryHeap, HashMap}; + +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; + +use crate::aggregate_methods; +use crate::analytics::{Aggregate, AggregateMethod}; +use crate::search::{SimilarQuery, SimilarResult}; + +aggregate_methods!( + SimilarPOST => "Similar POST", + SimilarGET => "Similar GET", +); + +#[derive(Default)] +pub struct SimilarAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // Whether the request asked for vectors to be returned (`retrieveVectors`) + retrieve_vectors: bool, + + // pagination + max_limit: usize, + max_offset: usize, + + // formatting + max_attributes_to_retrieve: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SimilarAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SimilarQuery) -> Self { + let SimilarQuery { + id: _, + embedder: _, + offset, + limit, + attributes_to_retrieve: _, + retrieve_vectors, + show_ranking_score, + show_ranking_score_details, + filter, + ranking_score_threshold, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + ret.max_limit = *limit; + ret.max_offset = *offset; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + ret.retrieve_vectors = *retrieve_vectors; + + ret + } + + pub fn succeed(&mut self, result: &SimilarResult) { + let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; + 
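+ // the route handlers only call `succeed` on `Ok` responses, so this tracks successful requests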
self.total_succeeded = self.total_succeeded.saturating_add(1); + + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SimilarAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + /// Aggregate one [SimilarAggregator] into another. + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.time_spent.append(&mut time_spent); + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + self.retrieve_vectors |= retrieve_vectors; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *self; + + // we get all the values in sorted order + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentile value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested in the slowest value among the 99% fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic + "total_received": total_received, + }, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + }, + "formatting": {
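+ // merged with `max` in `aggregate`, so the largest value observed across requests is reported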
"max_attributes_to_retrieve": max_attributes_to_retrieve, + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + } + }) + } +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index c25aeee70..b7260ea08 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -25,6 +25,7 @@ pub mod indexes; mod logs; mod metrics; mod multi_search; +mod multi_search_analytics; mod snapshot; mod swap_indexes; pub mod tasks; diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index 5fcb868c6..f8b1bc6ee 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -9,7 +9,8 @@ use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; -use crate::analytics::{Analytics, MultiSearchAggregator}; +use super::multi_search_analytics::MultiSearchAggregator; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -35,7 +36,7 @@ pub async fn multi_search_with_post( search_queue: Data, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { // Since we don't want to process half of the search requests and then get a permit refused // we're going to get one permit for the whole duration of the multi-search request. @@ -43,7 +44,7 @@ pub async fn multi_search_with_post( let federated_search = params.into_inner(); - let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search, &req); + let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search); let FederatedSearch { mut queries, federation } = federated_search; @@ -87,7 +88,7 @@ pub async fn multi_search_with_post( multi_aggregate.succeed(); } - analytics.post_multi_search(multi_aggregate); + analytics.publish(multi_aggregate, &req); HttpResponse::Ok().json(search_result??) } None => { @@ -149,7 +150,7 @@ pub async fn multi_search_with_post( if search_results.is_ok() { multi_aggregate.succeed(); } - analytics.post_multi_search(multi_aggregate); + analytics.publish(multi_aggregate, &req); let search_results = search_results.map_err(|(mut err, query_index)| { // Add the query index that failed as context for the error message. 
diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs
new file mode 100644
index 000000000..3d07f471c
--- /dev/null
+++ b/crates/meilisearch/src/routes/multi_search_analytics.rs
@@ -0,0 +1,168 @@
+use std::collections::HashSet;
+
+use serde_json::json;
+
+use crate::analytics::Aggregate;
+use crate::search::{FederatedSearch, SearchQueryWithIndex};
+
+#[derive(Default)]
+pub struct MultiSearchAggregator {
+    // requests
+    total_received: usize,
+    total_succeeded: usize,
+
+    // sum of the number of distinct indexes in each single request, use with total_received to compute an avg
+    total_distinct_index_count: usize,
+    // number of queries with a single index, use with total_received to compute a proportion
+    total_single_index: usize,
+
+    // sum of the number of search queries in the requests, use with total_received to compute an average
+    total_search_count: usize,
+
+    // scoring
+    show_ranking_score: bool,
+    show_ranking_score_details: bool,
+
+    // federation
+    use_federation: bool,
+}
+
+impl MultiSearchAggregator {
+    pub fn from_federated_search(federated_search: &FederatedSearch) -> Self {
+        let use_federation = federated_search.federation.is_some();
+
+        let distinct_indexes: HashSet<_> = federated_search
+            .queries
+            .iter()
+            .map(|query| {
+                let query = &query;
+                // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex
+                let SearchQueryWithIndex {
+                    index_uid,
+                    federation_options: _,
+                    q: _,
+                    vector: _,
+                    offset: _,
+                    limit: _,
+                    page: _,
+                    hits_per_page: _,
+                    attributes_to_retrieve: _,
+                    retrieve_vectors: _,
+                    attributes_to_crop: _,
+                    crop_length: _,
+                    attributes_to_highlight: _,
+                    show_ranking_score: _,
+                    show_ranking_score_details: _,
+                    show_matches_position: _,
+                    filter: _,
+                    sort: _,
+                    distinct: _,
+                    facets: _,
+                    highlight_pre_tag: _,
+                    highlight_post_tag: _,
+                    crop_marker: _,
+                    matching_strategy: _,
+                    attributes_to_search_on: _,
+                    hybrid: _,
+                    ranking_score_threshold: _,
+                    locales: _,
+                } = query;
+
+                index_uid.as_str()
+            })
+            .collect();
+
+        let show_ranking_score =
+            federated_search.queries.iter().any(|query| query.show_ranking_score);
+        let show_ranking_score_details =
+            federated_search.queries.iter().any(|query| query.show_ranking_score_details);
+
+        Self {
+            total_received: 1,
+            total_succeeded: 0,
+            total_distinct_index_count: distinct_indexes.len(),
+            total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 },
+            total_search_count: federated_search.queries.len(),
+            show_ranking_score,
+            show_ranking_score_details,
+            use_federation,
+        }
+    }
+
+    pub fn succeed(&mut self) {
+        self.total_succeeded = self.total_succeeded.saturating_add(1);
+    }
+}
+
+impl Aggregate for MultiSearchAggregator {
+    fn event_name(&self) -> &'static str {
+        "Documents Searched by Multi-Search POST"
+    }
+
+    /// Aggregate one [MultiSearchAggregator] into another.
+    fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
+        // write the aggregate in a way that will cause a compilation error if a field is added.
+
+        // take ownership of the boxed value.
+        let this = *self;
+
+        let total_received = this.total_received.saturating_add(new.total_received);
+        let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded);
+        let total_distinct_index_count =
+            this.total_distinct_index_count.saturating_add(new.total_distinct_index_count);
+        let total_single_index = this.total_single_index.saturating_add(new.total_single_index);
+        let total_search_count = this.total_search_count.saturating_add(new.total_search_count);
+        let show_ranking_score = this.show_ranking_score || new.show_ranking_score;
+        let show_ranking_score_details =
+            this.show_ranking_score_details || new.show_ranking_score_details;
+        let use_federation = this.use_federation || new.use_federation;
+
+        Box::new(Self {
+            total_received,
+            total_succeeded,
+            total_distinct_index_count,
+            total_single_index,
+            total_search_count,
+            show_ranking_score,
+            show_ranking_score_details,
+            use_federation,
+        })
+    }
+
+    fn into_event(self: Box<Self>) -> serde_json::Value {
+        let Self {
+            total_received,
+            total_succeeded,
+            total_distinct_index_count,
+            total_single_index,
+            total_search_count,
+            show_ranking_score,
+            show_ranking_score_details,
+            use_federation,
+        } = *self;
+
+        json!({
+            "requests": {
+                "total_succeeded": total_succeeded,
+                "total_failed": total_received.saturating_sub(total_succeeded), // saturating so this can never panic
+                "total_received": total_received,
+            },
+            "indexes": {
+                "total_single_index": total_single_index,
+                "total_distinct_index_count": total_distinct_index_count,
+                "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // never 0: each aggregator starts with total_received == 1
+            },
+            "searches": {
+                "total_search_count": total_search_count,
+                "avg_search_count": (total_search_count as f64) / (total_received as f64),
+            },
+            "scoring": {
+                "show_ranking_score": show_ranking_score,
+                "show_ranking_score_details": show_ranking_score_details,
+            },
+            "federation": {
+                "use_federation": use_federation,
+            }
+        })
+    }
+}
diff --git a/crates/meilisearch/src/routes/snapshot.rs b/crates/meilisearch/src/routes/snapshot.rs
index 84673729f..cacbc41af 100644
--- a/crates/meilisearch/src/routes/snapshot.rs
+++ b/crates/meilisearch/src/routes/snapshot.rs
@@ -3,7 +3,6 @@ use actix_web::{web, HttpRequest, HttpResponse};
 use index_scheduler::IndexScheduler;
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::tasks::KindWithContent;
-use serde_json::json;
 use tracing::debug;
 
 use crate::analytics::Analytics;
@@ -17,13 +16,15 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
     cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
 }
 
+crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created");
+
 pub async fn create_snapshot(
     index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
     req: HttpRequest,
     opt: web::Data<Opt>,
-    analytics: web::Data<dyn Analytics>,
+    analytics: web::Data<Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
-    analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
+    analytics.publish(SnapshotAnalytics::default(), &req);
 
     let task = KindWithContent::SnapshotCreation;
     let uid = get_task_id(&req, &opt)?;
diff --git a/crates/meilisearch/src/routes/swap_indexes.rs b/crates/meilisearch/src/routes/swap_indexes.rs
index 51a7b0707..9b8b67e63 100644
--- a/crates/meilisearch/src/routes/swap_indexes.rs
+++ b/crates/meilisearch/src/routes/swap_indexes.rs
@@ -8,10 +8,10 @@ use meilisearch_types::error::deserr_codes::InvalidSwapIndexes;
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::tasks::{IndexSwap, KindWithContent};
-use serde_json::json;
+use serde::Serialize;
 
 use super::{get_task_id, is_dry_run, SummarizedTaskView};
-use crate::analytics::Analytics;
+use crate::analytics::{Aggregate, Analytics};
 use crate::error::MeilisearchHttpError;
 use crate::extractors::authentication::policies::*;
 use crate::extractors::authentication::{AuthenticationError, GuardedData};
@@ -29,21 +29,36 @@ pub struct SwapIndexesPayload {
     indexes: Vec<IndexUid>,
 }
 
+#[derive(Serialize)]
+struct IndexSwappedAnalytics {
+    swap_operation_number: usize,
+}
+
+impl Aggregate for IndexSwappedAnalytics {
+    fn event_name(&self) -> &'static str {
+        "Indexes Swapped"
+    }
+
+    fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
+        Box::new(Self {
+            swap_operation_number: self.swap_operation_number.max(new.swap_operation_number),
+        })
+    }
+
+    fn into_event(self: Box<Self>) -> serde_json::Value {
+        serde_json::to_value(*self).unwrap_or_default()
+    }
+}
+
 pub async fn swap_indexes(
     index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
     params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
     req: HttpRequest,
     opt: web::Data<Opt>,
-    analytics: web::Data<dyn Analytics>,
+    analytics: web::Data<Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
     let params = params.into_inner();
-    analytics.publish(
-        "Indexes Swapped".to_string(),
-        json!({
-            "swap_operation_number": params.len(),
-        }),
-        Some(&req),
-    );
+    analytics.publish(IndexSwappedAnalytics { swap_operation_number: params.len() }, &req);
     let filters = index_scheduler.filters();
     let mut swaps = vec![];
diff --git a/crates/meilisearch/src/routes/tasks.rs b/crates/meilisearch/src/routes/tasks.rs
index 02f009ff7..95959d6d5 100644
--- a/crates/meilisearch/src/routes/tasks.rs
+++ b/crates/meilisearch/src/routes/tasks.rs
@@ -12,18 +12,17 @@ use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList};
 use meilisearch_types::task_view::TaskView;
 use meilisearch_types::tasks::{Kind, KindWithContent, Status};
 use serde::Serialize;
-use serde_json::json;
 use time::format_description::well_known::Rfc3339;
 use time::macros::format_description;
 use time::{Date, Duration, OffsetDateTime, Time};
 use tokio::task;
 
 use super::{get_task_id, is_dry_run, SummarizedTaskView};
-use crate::analytics::Analytics;
+use crate::analytics::{Aggregate, AggregateMethod, Analytics};
 use crate::extractors::authentication::policies::*;
 use crate::extractors::authentication::GuardedData;
 use crate::extractors::sequential_extractor::SeqHandler;
-use crate::Opt;
+use crate::{aggregate_methods, Opt};
 
 const DEFAULT_LIMIT: u32 = 20;
 
@@ -158,12 +157,69 @@ impl TaskDeletionOrCancelationQuery {
     }
 }
 
+aggregate_methods!(
+    CancelTasks => "Tasks Canceled",
+    DeleteTasks => "Tasks Deleted",
+);
+
+#[derive(Serialize)]
+struct TaskFilterAnalytics<Method: AggregateMethod> {
+    filtered_by_uid: bool,
+    filtered_by_index_uid: bool,
+    filtered_by_type: bool,
+    filtered_by_status: bool,
+    filtered_by_canceled_by: bool,
+    filtered_by_before_enqueued_at: bool,
+    filtered_by_after_enqueued_at: bool,
+    filtered_by_before_started_at: bool,
+    filtered_by_after_started_at: bool,
+    filtered_by_before_finished_at: bool,
+    filtered_by_after_finished_at: bool,
+
+    #[serde(skip)]
+    marker: std::marker::PhantomData<Method>,
+}
+
+impl<Method: AggregateMethod> Aggregate for TaskFilterAnalytics<Method> {
+    fn event_name(&self) -> &'static str {
+        Method::event_name()
+    }
+
+    fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
+        Box::new(Self {
+            filtered_by_uid: self.filtered_by_uid | new.filtered_by_uid,
+            filtered_by_index_uid: self.filtered_by_index_uid | new.filtered_by_index_uid,
+            filtered_by_type: self.filtered_by_type | new.filtered_by_type,
+            filtered_by_status: self.filtered_by_status | new.filtered_by_status,
+            filtered_by_canceled_by: self.filtered_by_canceled_by | new.filtered_by_canceled_by,
+            filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at
+                | new.filtered_by_before_enqueued_at,
+            filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at
+                | new.filtered_by_after_enqueued_at,
+            filtered_by_before_started_at: self.filtered_by_before_started_at
+                | new.filtered_by_before_started_at,
+            filtered_by_after_started_at: self.filtered_by_after_started_at
+                | new.filtered_by_after_started_at,
+            filtered_by_before_finished_at: self.filtered_by_before_finished_at
+                | new.filtered_by_before_finished_at,
+            filtered_by_after_finished_at: self.filtered_by_after_finished_at
+                | new.filtered_by_after_finished_at,
+
+            marker: std::marker::PhantomData,
+        })
+    }
+
+    fn into_event(self: Box<Self>) -> serde_json::Value {
+        serde_json::to_value(*self).unwrap_or_default()
+    }
+}
+
 async fn cancel_tasks(
     index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
     params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
     req: HttpRequest,
     opt: web::Data<Opt>,
-    analytics: web::Data<dyn Analytics>,
+    analytics: web::Data<Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
     let params = params.into_inner();
 
@@ -172,21 +228,22 @@ async fn cancel_tasks(
     }
 
     analytics.publish(
-        "Tasks Canceled".to_string(),
-        json!({
-            "filtered_by_uid": params.uids.is_some(),
-            "filtered_by_index_uid": params.index_uids.is_some(),
-            "filtered_by_type": params.types.is_some(),
-            "filtered_by_status": params.statuses.is_some(),
-            "filtered_by_canceled_by": params.canceled_by.is_some(),
-            "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
-            "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
-            "filtered_by_before_started_at": params.before_started_at.is_some(),
-            "filtered_by_after_started_at": params.after_started_at.is_some(),
-            "filtered_by_before_finished_at": params.before_finished_at.is_some(),
-            "filtered_by_after_finished_at": params.after_finished_at.is_some(),
-        }),
-        Some(&req),
+        TaskFilterAnalytics::<CancelTasks> {
+            filtered_by_uid: params.uids.is_some(),
+            filtered_by_index_uid: params.index_uids.is_some(),
+            filtered_by_type: params.types.is_some(),
+            filtered_by_status: params.statuses.is_some(),
+            filtered_by_canceled_by: params.canceled_by.is_some(),
+            filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(),
+            filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(),
+            filtered_by_before_started_at: params.before_started_at.is_some(),
+            filtered_by_after_started_at: params.after_started_at.is_some(),
+            filtered_by_before_finished_at: params.before_finished_at.is_some(),
+            filtered_by_after_finished_at: params.after_finished_at.is_some(),
+
+            marker: std::marker::PhantomData,
+        },
+        &req,
     );
 
     let query = params.into_query();
@@ -214,7 +271,7 @@ async fn delete_tasks(
     params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
     req: HttpRequest,
     opt: web::Data<Opt>,
-    analytics: web::Data<dyn Analytics>,
+    analytics: web::Data<Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
     let params = params.into_inner();
 
@@ -223,22 +280,24 @@ async fn delete_tasks(
     }
 
     analytics.publish(
-        "Tasks Deleted".to_string(),
-        json!({
-            "filtered_by_uid": params.uids.is_some(),
-            "filtered_by_index_uid": params.index_uids.is_some(),
-            "filtered_by_type": params.types.is_some(),
-            "filtered_by_status": params.statuses.is_some(),
-            "filtered_by_canceled_by": params.canceled_by.is_some(),
-            "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
-            "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
-            "filtered_by_before_started_at": params.before_started_at.is_some(),
-            "filtered_by_after_started_at": params.after_started_at.is_some(),
-            "filtered_by_before_finished_at": params.before_finished_at.is_some(),
-            "filtered_by_after_finished_at": params.after_finished_at.is_some(),
-        }),
-        Some(&req),
+        TaskFilterAnalytics::<DeleteTasks> {
+            filtered_by_uid: params.uids.is_some(),
+            filtered_by_index_uid: params.index_uids.is_some(),
+            filtered_by_type: params.types.is_some(),
+            filtered_by_status: params.statuses.is_some(),
+            filtered_by_canceled_by: params.canceled_by.is_some(),
+            filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(),
+            filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(),
+            filtered_by_before_started_at: params.before_started_at.is_some(),
+            filtered_by_after_started_at: params.after_started_at.is_some(),
+            filtered_by_before_finished_at: params.before_finished_at.is_some(),
+            filtered_by_after_finished_at: params.after_finished_at.is_some(),
+
+            marker: std::marker::PhantomData,
+        },
+        &req,
     );
+
     let query = params.into_query();
     let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes(
diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs
index 66b6e56de..7832c1761 100644
--- a/crates/meilisearch/src/search/mod.rs
+++ b/crates/meilisearch/src/search/mod.rs
@@ -1195,8 +1195,13 @@ impl<'a> HitMaker<'a> {
         let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
             // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
             (None, _) => false,
-            // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
-            (Some(_), None) => true,
+            // vectors has no fid, so check its explicit name
+            (Some(_), None) => {
+                // unwrap is safe: a `None` would have matched the `(None, _)` arm above
+                let displayed_names = index.displayed_fields(rtxn)?.unwrap();
+                !displayed_names
+                    .contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME)
+            }
             // displayed_ids is a finite list, so hide if `_vectors` is not part of it
             (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
         };
diff --git a/crates/meilisearch/tests/common/index.rs b/crates/meilisearch/tests/common/index.rs
index 784067c2d..221333fd7 100644
--- a/crates/meilisearch/tests/common/index.rs
+++ b/crates/meilisearch/tests/common/index.rs
@@ -9,8 +9,7 @@ use urlencoding::encode as urlencode;
 
 use super::encoder::Encoder;
 use super::service::Service;
-use super::Value;
-use super::{Owned, Shared};
+use super::{Owned, Shared, Value};
 use crate::json;
 
 pub struct Index<'a, State = Owned> {
diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs
index 200f6a141..5069c9ea6 100644
--- a/crates/meilisearch/tests/common/server.rs
+++ b/crates/meilisearch/tests/common/server.rs
@@ -386,7 +386,6 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
         db_path: dir.as_ref().join("db"),
         dump_dir: dir.as_ref().join("dumps"),
         env: "development".to_owned(),
-        #[cfg(feature = "analytics")]
         no_analytics: true,
         max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(),
         max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(),
diff --git a/crates/meilisearch/tests/common/service.rs b/crates/meilisearch/tests/common/service.rs
index 8addbacf8..c0b07c217 100644
--- a/crates/meilisearch/tests/common/service.rs
+++ b/crates/meilisearch/tests/common/service.rs
@@ -9,8 +9,9 @@ use actix_web::test;
 use actix_web::test::TestRequest;
 use actix_web::web::Data;
 use index_scheduler::IndexScheduler;
+use meilisearch::analytics::Analytics;
 use meilisearch::search_queue::SearchQueue;
-use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
+use meilisearch::{create_app, Opt, SubscriberForSecondLayer};
 use meilisearch_auth::AuthController;
 use tracing::level_filters::LevelFilter;
 use tracing_subscriber::Layer;
@@ -141,7 +142,7 @@ impl Service {
                 Data::new(search_queue),
                 self.options.clone(),
                 (route_layer_handle, stderr_layer_handle),
-                analytics::MockAnalytics::new(&self.options),
+                Data::new(Analytics::no_analytics()),
                 true,
             ))
             .await
diff --git a/crates/meilisearch/tests/documents/update_documents.rs b/crates/meilisearch/tests/documents/update_documents.rs
index 195dca914..c0703e81b 100644
--- a/crates/meilisearch/tests/documents/update_documents.rs
+++ b/crates/meilisearch/tests/documents/update_documents.rs
@@ -23,8 +23,8 @@ async fn error_document_update_create_index_bad_uid() {
 
 #[actix_rt::test]
 async fn document_update_with_primary_key() {
-    let server = Server::new().await;
-    let index = server.index("test");
+    let server = Server::new_shared();
+    let index = server.unique_index();
 
     let documents = json!([
         {
@@ -32,15 +32,14 @@ async fn document_update_with_primary_key() {
             "content": "foo",
         }
     ]);
-    let (_response, code) = index.update_documents(documents, Some("primary")).await;
+    let (response, code) = index.update_documents(documents, Some("primary")).await;
     assert_eq!(code, 202);
 
-    index.wait_task(0).await;
+    index.wait_task(response.uid()).await.succeeded();
 
-    let (response, code) = index.get_task(0).await;
+    let (response, code) = index.get_task(response.uid()).await;
     assert_eq!(code, 200);
     assert_eq!(response["status"], "succeeded");
-    assert_eq!(response["uid"], 0);
     assert_eq!(response["type"], "documentAdditionOrUpdate");
     assert_eq!(response["details"]["indexedDocuments"], 1);
     assert_eq!(response["details"]["receivedDocuments"], 1);
@@ -52,8 +51,8 @@
 
 #[actix_rt::test]
 async fn update_document() {
-    let server = Server::new().await;
-    let index = server.index("test");
+    let server = Server::new_shared();
+    let index = server.unique_index();
 
     let documents = json!([
         {
@@ -62,10 +61,10 @@ async fn update_document() {
         }
    ]);
 
-    let (_response, code) = index.add_documents(documents, None).await;
+    let (response, code) = index.add_documents(documents, None).await;
     assert_eq!(code, 202);
 
-    index.wait_task(0).await;
+    index.wait_task(response.uid()).await.succeeded();
 
     let documents = json!([
         {
@@ -77,9 +76,9 @@ async fn update_document() {
     let (response, code) = index.update_documents(documents, None).await;
     assert_eq!(code, 202, "response: {}", response);
 
-    index.wait_task(1).await;
+    index.wait_task(response.uid()).await.succeeded();
 
-    let (response, code) = index.get_task(1).await;
+    let (response, code) = index.get_task(response.uid()).await;
     assert_eq!(code, 200);
     assert_eq!(response["status"], "succeeded");
 
@@ -96,8 +95,8 @@
 
 #[actix_rt::test]
 async fn update_document_gzip_encoded() {
-    let server = Server::new().await;
-    let index = server.index_with_encoder("test", Encoder::Gzip);
+    let server = Server::new_shared();
+    let index = server.unique_index_with_encoder(Encoder::Gzip);
 
     let documents = json!([
         {
@@ -106,10 +105,10 @@ async fn update_document_gzip_encoded() {
         }
    ]);
 
-    let (_response, code) = index.add_documents(documents, None).await;
+    let (response, code) = index.add_documents(documents, None).await;
     assert_eq!(code, 202);
 
-    index.wait_task(0).await;
+    index.wait_task(response.uid()).await.succeeded();
 
     let documents = json!([
         {
@@ -121,9 +120,9 @@ async fn update_document_gzip_encoded() {
     let (response, code) = index.update_documents(documents, None).await;
     assert_eq!(code, 202, "response: {}", response);
 
-    index.wait_task(1).await;
+    index.wait_task(response.uid()).await.succeeded();
 
-    let (response, code) = index.get_task(1).await;
+    let (response, code) = index.get_task(response.uid()).await;
     assert_eq!(code, 200);
     assert_eq!(response["status"], "succeeded");
 
@@ -140,12 +139,12 @@
 
 #[actix_rt::test]
 async fn update_larger_dataset() {
-    let server = Server::new().await;
-    let index = server.index("test");
+    let server = Server::new_shared();
+    let index = server.unique_index();
     let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap();
-    index.update_documents(documents, None).await;
-    index.wait_task(0).await;
-    let (response, code) = index.get_task(0).await;
+    let (task, _code) = index.update_documents(documents, None).await;
+    index.wait_task(task.uid()).await.succeeded();
+    let (response, code) = index.get_task(task.uid()).await;
     assert_eq!(code, 200);
     assert_eq!(response["type"], "documentAdditionOrUpdate");
     assert_eq!(response["details"]["indexedDocuments"], 77);
@@ -158,8 +157,8 @@
 
 #[actix_rt::test]
 async fn error_update_documents_bad_document_id() {
-    let server = Server::new().await;
-    let index = server.index("test");
+    let server = Server::new_shared();
+    let index = server.unique_index();
     index.create(Some("docid")).await;
     let documents = json!([
         {
@@ -167,8 +166,8 @@ async fn error_update_documents_bad_document_id() {
             "content": "foobar"
         }
    ]);
-    index.update_documents(documents, None).await;
-    let response = index.wait_task(1).await;
+    let (task, _code) = index.update_documents(documents, None).await;
+    let response = index.wait_task(task.uid()).await;
     assert_eq!(response["status"], json!("failed"));
     assert_eq!(
         response["error"]["message"],
@@ -186,8 +185,8 @@
 
 #[actix_rt::test]
 async fn error_update_documents_missing_document_id() {
-    let server = Server::new().await;
-    let index = server.index("test");
+    let server = Server::new_shared();
+    let index = server.unique_index();
     index.create(Some("docid")).await;
     let documents = json!([
         {
@@ -195,8 +194,8 @@ async fn error_update_documents_missing_document_id() {
             "content": "foobar"
         }
    ]);
-    index.update_documents(documents, None).await;
-    let response = index.wait_task(1).await;
+    let (task, _code) = index.update_documents(documents, None).await;
+    let response = index.wait_task(task.uid()).await;
     assert_eq!(response["status"], "failed");
     assert_eq!(
         response["error"]["message"],
@@ -212,8 +211,8 @@
 
 #[actix_rt::test]
 async fn update_faceted_document() {
-    let server = Server::new().await;
-    let index = server.index("test");
+    let server = Server::new_shared();
+    let index = server.unique_index();
 
     let (response, code) = index
         .update_settings(json!({
@@ -221,7 +220,7 @@ async fn update_faceted_document() {
         }))
         .await;
     assert_eq!("202", code.as_str(), "{:?}", response);
-    index.wait_task(0).await;
+    index.wait_task(response.uid()).await.succeeded();
 
     let documents: Vec<_> = (0..1000)
         .map(|id| {
@@ -232,10 +231,10 @@ async fn update_faceted_document() {
        })
        .collect();
 
-    let (_response, code) = index.add_documents(documents.into(), None).await;
+    let (response, code) = index.add_documents(documents.into(), None).await;
     assert_eq!(code, 202);
 
-    index.wait_task(1).await;
+    index.wait_task(response.uid()).await.succeeded();
     let documents = json!([
         {
@@ -247,7 +246,7 @@ async fn update_faceted_document() {
     let (response, code) = index.update_documents(documents, None).await;
     assert_eq!(code, 202, "response: {}", response);
 
-    index.wait_task(2).await;
+    index.wait_task(response.uid()).await.succeeded();
 
     index
         .search(json!({"limit": 10}), |response, code| {
diff --git a/crates/meilisearch/tests/logs/mod.rs b/crates/meilisearch/tests/logs/mod.rs
index 9f4649dca..26482b561 100644
--- a/crates/meilisearch/tests/logs/mod.rs
+++ b/crates/meilisearch/tests/logs/mod.rs
@@ -7,8 +7,9 @@ use std::str::FromStr;
 
 use actix_web::http::header::ContentType;
 use actix_web::web::Data;
 use meili_snap::snapshot;
+use meilisearch::analytics::Analytics;
 use meilisearch::search_queue::SearchQueue;
-use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
+use meilisearch::{create_app, Opt, SubscriberForSecondLayer};
 use tracing::level_filters::LevelFilter;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::Layer;
@@ -54,7 +55,7 @@ async fn basic_test_log_stream_route() {
         Data::new(search_queue),
         server.service.options.clone(),
         (route_layer_handle, stderr_layer_handle),
-        analytics::MockAnalytics::new(&server.service.options),
+        Data::new(Analytics::no_analytics()),
         true,
     ))
     .await;
diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs
index e301c0b05..00a65d9aa 100644
--- a/crates/meilisearch/tests/search/hybrid.rs
+++ b/crates/meilisearch/tests/search/hybrid.rs
@@ -568,6 +568,57 @@ async fn retrieve_vectors() {
     ]
     "###);
 
+    // use explicit `_vectors` in displayed attributes
+    let (response, code) = index
+        .update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} ))
+        .await;
+    assert_eq!(202, code, "{:?}", response);
+    index.wait_task(response.uid()).await;
+
+    let (response, code) = index
+        .search_post(
+            json!({"q": "Captain", "hybrid": {"embedder": "default", "semanticRatio": 0.2}, "retrieveVectors": true}),
+        )
+        .await;
+    snapshot!(code, @"200 OK");
+    insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
+    [
+      {
+        "title": "Captain Planet",
+        "desc": "He's not part of the Marvel Cinematic Universe",
+        "id": "2",
+        "_vectors": {
+          "default": {
+            "embeddings": "[vectors]",
+            "regenerate": true
+          }
+        }
+      },
+      {
+        "title": "Captain Marvel",
+        "desc": "a Shazam ersatz",
+        "id": "3",
+        "_vectors": {
+          "default": {
+            "embeddings": "[vectors]",
+            "regenerate": true
+          }
+        }
+      },
+      {
+        "title": "Shazam!",
+        "desc": "a Captain Marvel ersatz",
+        "id": "1",
+        "_vectors": {
+          "default": {
+            "embeddings": "[vectors]",
+            "regenerate": true
+          }
+        }
+      }
+    ]
+    "###);
+
     // remove `_vectors` from displayed attributes
     let (response, code) =
         index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs
index 4f07ca18b..ed45913a8 100644
--- a/crates/meilisearch/tests/vector/settings.rs
+++ b/crates/meilisearch/tests/vector/settings.rs
@@ -4,6 +4,53 @@ use crate::common::{GetAllDocumentsOptions, Server};
 use crate::json;
 use crate::vector::generate_default_user_provided_documents;
 
+#[actix_rt::test]
+async fn field_unavailable_for_source() {
+    let server = Server::new().await;
+    let index = server.index("doggo");
+    let (value, code) = server.set_features(json!({"vectorStore": true})).await;
+    snapshot!(code, @"200 OK");
+    snapshot!(value, @r###"
+    {
+      "vectorStore": true,
+      "metrics": false,
+      "logsRoute": false,
+      "editDocumentsByFunction": false,
+      "containsFilter": false
+    }
+    "###);
+
+    let (response, code) = index
+        .update_settings(json!({
+          "embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}},
+        }))
+        .await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(response, @r###"
+    {
+      "message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`",
+      "code": "invalid_settings_embedders",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
+    }
+    "###);
+
+    let (response, code) = index
+        .update_settings(json!({
+          "embedders": { "default": {"source": "openAi", "revision": "42"}},
+        }))
+        .await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(response, @r###"
+    {
+      "message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`",
+      "code": "invalid_settings_embedders",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
+    }
+    "###);
+}
+
 #[actix_rt::test]
 async fn update_embedder() {
     let server = Server::new().await;
diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml
index ce6c1ad5b..048da6232 100644
--- a/crates/meilitool/Cargo.toml
+++ b/crates/meilitool/Cargo.toml
@@ -16,5 +16,6 @@ file-store = { path = "../file-store" }
 meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 serde = { version = "1.0.209", features = ["derive"] }
-time = { version = "0.3.36", features = ["formatting"] }
+time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] }
 uuid = { version = "1.10.0", features = ["v4"], default-features = false }
+arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" }
diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs
index 9dbff2486..978824356 100644
--- a/crates/meilitool/src/main.rs
+++ b/crates/meilitool/src/main.rs
@@ -2,7 +2,7 @@ use std::fs::{read_dir, read_to_string, remove_file, File};
 use std::io::BufWriter;
 use std::path::PathBuf;
 
-use anyhow::{bail, Context};
+use anyhow::Context;
 use clap::{Parser, Subcommand};
 use dump::{DumpWriter, IndexMetadata};
 use file_store::FileStore;
@@ -10,15 +10,16 @@ use meilisearch_auth::AuthController;
 use meilisearch_types::heed::types::{SerdeJson, Str};
 use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
-use meilisearch_types::milli::index::{db_name, main_key};
 use meilisearch_types::milli::{obkv_to_json, BEU32};
 use meilisearch_types::tasks::{Status, Task};
-use meilisearch_types::versioning::{create_version_file, get_version, parse_version};
+use meilisearch_types::versioning::{get_version, parse_version};
 use meilisearch_types::Index;
 use time::macros::format_description;
 use time::OffsetDateTime;
+use upgrade::OfflineUpgrade;
 use uuid_codec::UuidCodec;
 
+mod upgrade;
 mod uuid_codec;
 
 #[derive(Parser)]
@@ -72,7 +73,7 @@ enum Command {
     ///
     /// Supported upgrade paths:
     ///
-    /// - v1.9.0 -> v1.10.0
+    /// - v1.9.x -> v1.10.x -> v1.11.x
     OfflineUpgrade {
         #[arg(long)]
         target_version: String,
@@ -96,425 +97,6 @@ fn main() -> anyhow::Result<()> {
     }
 }
 
-struct OfflineUpgrade {
-    db_path: PathBuf,
-    current_version: (String, String, String),
-    target_version: (String, String, String),
-}
-
-impl OfflineUpgrade {
-    fn upgrade(self) -> anyhow::Result<()> {
-        // TODO: if we make this process support more versions, introduce a more flexible way of checking for the version
-        // currently only supports v1.9 to v1.10
-        let (current_major, current_minor, current_patch) = &self.current_version;
-
-        match (current_major.as_str(), current_minor.as_str(), current_patch.as_str()) {
-            ("1", "9", _) => {}
-            _ => {
-                bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9")
-            }
-        }
-
-        let (target_major, target_minor, target_patch) = &self.target_version;
-
-        match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
-            ("1", "10", _) => {}
-            _ => {
-                bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10")
-            }
-        }
-
-        println!("Upgrading from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
-
-        self.v1_9_to_v1_10()?;
-
-        println!("Writing VERSION file");
-
-        create_version_file(&self.db_path, target_major, target_minor, target_patch)
-            .context("while writing VERSION file after the upgrade")?;
-
-        println!("Success");
-
-        Ok(())
-    }
-
-    fn v1_9_to_v1_10(&self) -> anyhow::Result<()> {
-        // 2 changes here
-
-        // 1. date format. needs to be done before opening the Index
-        // 2. REST embedders. We don't support this case right now, so bail
-
-        let index_scheduler_path = self.db_path.join("tasks");
-        let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
-            .with_context(|| {
-                format!("While trying to open {:?}", index_scheduler_path.display())
-            })?;
-
-        let mut sched_wtxn = env.write_txn()?;
-
-        let index_mapping: Database<Str, UuidCodec> =
-            try_opening_database(&env, &sched_wtxn, "index-mapping")?;
-
-        let index_stats: Database<UuidCodec, Unspecified> =
-            try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
-                format!("While trying to open {:?}", index_scheduler_path.display())
-            })?;
-
-        let index_count =
-            index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
-
-        // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
-        // 1. immutably for the iteration
-        // 2. mutably for updating index stats
-        let indexes: Vec<_> = index_mapping
-            .iter(&sched_wtxn)?
-            .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
-            .collect();
-
-        let mut rest_embedders = Vec::new();
-
-        let mut unwrapped_indexes = Vec::new();
-
-        // check that update can take place
-        for (index_index, result) in indexes.into_iter().enumerate() {
-            let (uid, uuid) = result?;
-            let index_path = self.db_path.join("indexes").join(uuid.to_string());
-
-            println!(
-                "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
-                index_index + 1,
-                index_path.display()
-            );
-
-            let index_env = unsafe {
-                // FIXME: fetch the 25 magic number from the index file
-                EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
-                    format!("while opening index {uid} at '{}'", index_path.display())
-                })?
-            };
-
-            let index_txn = index_env.read_txn().with_context(|| {
-                format!(
-                    "while obtaining a write transaction for index {uid} at {}",
-                    index_path.display()
-                )
-            })?;
-
-            println!("\t- Checking for incompatible embedders (REST embedders)");
-            let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
-
-            if rest_embedders_for_index.is_empty() {
-                unwrapped_indexes.push((uid, uuid));
-            } else {
-                // no need to add to unwrapped indexes because we'll exit early
-                rest_embedders.push((uid, rest_embedders_for_index));
-            }
-        }
-
-        if !rest_embedders.is_empty() {
-            let rest_embedders = rest_embedders
-                .into_iter()
-                .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
-                .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
-                .collect::<Vec<String>>()
-                .join("\n");
-            bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
-            The database has not been modified and is still a valid v1.9 database.");
-        }
-
-        println!("Update can take place, updating");
-
-        for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
-            let index_path = self.db_path.join("indexes").join(uuid.to_string());
-
-            println!(
-                "[{}/{index_count}]Updating index `{uid}` at `{}`",
-                index_index + 1,
-                index_path.display()
-            );
-
-            let index_env = unsafe {
-                // FIXME: fetch the 25 magic number from the index file
-                EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
-                    format!("while opening index {uid} at '{}'", index_path.display())
-                })?
-            };
-
-            let mut index_wtxn = index_env.write_txn().with_context(|| {
-                format!(
-                    "while obtaining a write transaction for index `{uid}` at `{}`",
-                    index_path.display()
-                )
-            })?;
-
-            println!("\t- Updating index stats");
-            update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
-            println!("\t- Updating date format");
-            update_date_format(&uid, &index_env, &mut index_wtxn)?;
-
-            index_wtxn.commit().with_context(|| {
-                format!(
-                    "while committing the write txn for index `{uid}` at {}",
-                    index_path.display()
-                )
-            })?;
-        }
-
-        sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
-
-        println!("Upgrading database succeeded");
-
-        Ok(())
-    }
-}
-
-pub mod v1_9 {
-    pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
-
-    /// The statistics that can be computed from an `Index` object.
-    #[derive(serde::Serialize, serde::Deserialize, Debug)]
-    pub struct IndexStats {
-        /// Number of documents in the index.
-        pub number_of_documents: u64,
-        /// Size taken up by the index' DB, in bytes.
-        ///
-        /// This includes the size taken by both the used and free pages of the DB, and as the free pages
-        /// are not returned to the disk after a deletion, this number is typically larger than
-        /// `used_database_size` that only includes the size of the used pages.
-        pub database_size: u64,
-        /// Size taken by the used pages of the index' DB, in bytes.
-        ///
-        /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
-        /// this value is typically smaller than `database_size`.
-        pub used_database_size: u64,
-        /// Association of every field name with the number of times it occurs in the documents.
-        pub field_distribution: FieldDistribution,
-        /// Creation date of the index.
-        pub created_at: time::OffsetDateTime,
-        /// Date of the last update of the index.
-        pub updated_at: time::OffsetDateTime,
-    }
-
-    use serde::{Deserialize, Serialize};
-
-    #[derive(Debug, Deserialize, Serialize)]
-    pub struct IndexEmbeddingConfig {
-        pub name: String,
-        pub config: EmbeddingConfig,
-    }
-
-    #[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
-    pub struct EmbeddingConfig {
-        /// Options of the embedder, specific to each kind of embedder
-        pub embedder_options: EmbedderOptions,
-    }
-
-    /// Options of an embedder, specific to each kind of embedder.
-    #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
-    pub enum EmbedderOptions {
-        HuggingFace(hf::EmbedderOptions),
-        OpenAi(openai::EmbedderOptions),
-        Ollama(ollama::EmbedderOptions),
-        UserProvided(manual::EmbedderOptions),
-        Rest(rest::EmbedderOptions),
-    }
-
-    impl Default for EmbedderOptions {
-        fn default() -> Self {
-            Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
-        }
-    }
-
-    mod hf {
-        #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
-        pub struct EmbedderOptions {
-            pub model: String,
-            pub revision: Option<String>,
-        }
-    }
-    mod openai {
-
-        #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
-        pub struct EmbedderOptions {
-            pub api_key: Option<String>,
-            pub dimensions: Option<usize>,
-        }
-    }
-    mod ollama {
-        #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
-        pub struct EmbedderOptions {
-            pub embedding_model: String,
-            pub url: Option<String>,
-            pub api_key: Option<String>,
-        }
-    }
-    mod manual {
-        #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
-        pub struct EmbedderOptions {
-            pub dimensions: usize,
-        }
-    }
-    mod rest {
-        #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
-        pub struct EmbedderOptions {
-            pub api_key: Option<String>,
-            pub dimensions: Option<usize>,
-            pub url: String,
-            pub input_field: Vec<String>,
-            // path to the array of embeddings
-            pub path_to_embeddings: Vec<String>,
-            // shape of a single embedding
-            pub embedding_object: Vec<String>,
-        }
-    }
-
-    pub type OffsetDateTime = time::OffsetDateTime;
-}
-
-pub mod v1_10 {
-    use crate::v1_9;
-
-    pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
-
-    /// The statistics that can be computed from an `Index` object.
-    #[derive(serde::Serialize, serde::Deserialize, Debug)]
-    pub struct IndexStats {
-        /// Number of documents in the index.
-        pub number_of_documents: u64,
-        /// Size taken up by the index' DB, in bytes.
-        ///
-        /// This includes the size taken by both the used and free pages of the DB, and as the free pages
-        /// are not returned to the disk after a deletion, this number is typically larger than
-        /// `used_database_size` that only includes the size of the used pages.
-        pub database_size: u64,
-        /// Size taken by the used pages of the index' DB, in bytes.
-        ///
-        /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
-        /// this value is typically smaller than `database_size`.
-        pub used_database_size: u64,
-        /// Association of every field name with the number of times it occurs in the documents.
-        pub field_distribution: FieldDistribution,
-        /// Creation date of the index.
-        #[serde(with = "time::serde::rfc3339")]
-        pub created_at: time::OffsetDateTime,
-        /// Date of the last update of the index.
-        #[serde(with = "time::serde::rfc3339")]
-        pub updated_at: time::OffsetDateTime,
-    }
-
-    impl From<v1_9::IndexStats> for IndexStats {
-        fn from(
-            v1_9::IndexStats {
-                number_of_documents,
-                database_size,
-                used_database_size,
-                field_distribution,
-                created_at,
-                updated_at,
-            }: v1_9::IndexStats,
-        ) -> Self {
-            IndexStats {
-                number_of_documents,
-                database_size,
-                used_database_size,
-                field_distribution,
-                created_at,
-                updated_at,
-            }
-        }
-    }
-
-    #[derive(serde::Serialize, serde::Deserialize)]
-    #[serde(transparent)]
-    pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
-}
-
-fn update_index_stats(
-    index_stats: Database<UuidCodec, Unspecified>,
-    index_uid: &str,
-    index_uuid: uuid::Uuid,
-    sched_wtxn: &mut RwTxn,
-) -> anyhow::Result<()> {
-    let ctx = || format!("while updating index stats for index `{index_uid}`");
-
-    let stats: Option<v1_9::IndexStats> = index_stats
-        .remap_data_type::<SerdeJson<v1_9::IndexStats>>()
-        .get(sched_wtxn, &index_uuid)
-        .with_context(ctx)?;
-
-    if let Some(stats) = stats {
-        let stats: v1_10::IndexStats = stats.into();
-
-        index_stats
-            .remap_data_type::<SerdeJson<v1_10::IndexStats>>()
-            .put(sched_wtxn, &index_uuid, &stats)
-            .with_context(ctx)?;
-    }
-
-    Ok(())
-}
-
-fn update_date_format(
-    index_uid: &str,
-    index_env: &Env,
-    index_wtxn: &mut RwTxn,
-) -> anyhow::Result<()> {
-    let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
-        .with_context(|| format!("while updating date format for index `{index_uid}`"))?;
-
-    date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
-    date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
-
-    Ok(())
-}
-
-fn find_rest_embedders(
-    index_uid: &str,
-    index_env: &Env,
-    index_txn: &RoTxn,
-) -> anyhow::Result<Vec<String>> {
-    let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
-        .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
-
-    let mut rest_embedders = vec![];
-
-    for config in main
-        .remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
-        .get(index_txn, main_key::EMBEDDING_CONFIGS)?
-        .unwrap_or_default()
-    {
-        if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
-            rest_embedders.push(config.name);
-        }
-    }
-
-    Ok(rest_embedders)
-}
-
-fn date_round_trip(
-    wtxn: &mut RwTxn,
-    index_uid: &str,
-    db: Database<Unspecified, Unspecified>,
-    key: &str,
-) -> anyhow::Result<()> {
-    let datetime =
-        db.remap_types::<Str, SerdeJson<v1_9::OffsetDateTime>>().get(wtxn, key).with_context(
-            || format!("could not read `{key}` while updating date format for index `{index_uid}`"),
-        )?;
-
-    if let Some(datetime) = datetime {
-        db.remap_types::<Str, SerdeJson<v1_10::OffsetDateTime>>()
-            .put(wtxn, key, &v1_10::OffsetDateTime(datetime))
-            .with_context(|| {
-                format!(
-                    "could not write `{key}` while updating date format for index `{index_uid}`"
-                )
-            })?;
-    }
-
-    Ok(())
-}
-
 /// Clears the task queue located at `db_path`.
 fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
     let path = db_path.join("tasks");
diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs
new file mode 100644
index 000000000..36630c3b3
--- /dev/null
+++ b/crates/meilitool/src/upgrade/mod.rs
@@ -0,0 +1,73 @@
+mod v1_10;
+mod v1_11;
+mod v1_9;
+
+use std::path::{Path, PathBuf};
+
+use anyhow::{bail, Context};
+use meilisearch_types::versioning::create_version_file;
+
+use v1_10::v1_9_to_v1_10;
+
+use crate::upgrade::v1_11::v1_10_to_v1_11;
+
+pub struct OfflineUpgrade {
+    pub db_path: PathBuf,
+    pub current_version: (String, String, String),
+    pub target_version: (String, String, String),
+}
+
+impl OfflineUpgrade {
+    pub fn upgrade(self) -> anyhow::Result<()> {
+        let upgrade_list = [
+            (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"),
+            (v1_10_to_v1_11, "1", "11", "0"),
+        ];
+
+        let (current_major, current_minor, current_patch) = &self.current_version;
+
+        let start_at = match (
+            current_major.as_str(),
+            current_minor.as_str(),
+            current_patch.as_str(),
+        ) {
+            ("1", "9", _) => 0,
+            ("1", "10", _) => 1,
+            _ => {
+                bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 or v1.10")
+            }
+        };
+
+        let (target_major, target_minor, target_patch) = &self.target_version;
+
+        let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
+            ("1", "10", _) => 0,
+            ("1", "11", _) => 1,
+            (major, _, _) if major.starts_with('v') => {
+                bail!("Target version must not start with a `v`. Write `1.9.0` instead of `v1.9.0`, for example.")
+            }
+            _ => {
+                bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 or v1.11")
+            }
+        };
+
+        println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
+
+        #[allow(clippy::needless_range_loop)]
+        for index in start_at..=ends_at {
+            let (func, major, minor, patch) = upgrade_list[index];
+            (func)(&self.db_path)?;
+            println!("Done");
+            // We write the version file after each step in case an issue arises _while_ upgrading:
+            // we don't want the DB to be left in an unknown state.
+            println!("Writing VERSION file");
+
+            create_version_file(&self.db_path, major, minor, patch)
+                .context("while writing VERSION file after the upgrade")?;
+        }
+
+        println!("Success");
+
+        Ok(())
+    }
+}
diff --git a/crates/meilitool/src/upgrade/v1_10.rs b/crates/meilitool/src/upgrade/v1_10.rs
new file mode 100644
index 000000000..3dd7c72a2
--- /dev/null
+++ b/crates/meilitool/src/upgrade/v1_10.rs
@@ -0,0 +1,289 @@
+use anyhow::bail;
+use std::path::Path;
+
+use anyhow::Context;
+use meilisearch_types::{
+    heed::{
+        types::{SerdeJson, Str},
+        Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
+    },
+    milli::index::{db_name, main_key},
+};
+
+use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
+
+use super::v1_9;
+
+pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
+
+/// The statistics that can be computed from an `Index` object.
+#[derive(serde::Serialize, serde::Deserialize, Debug)]
+pub struct IndexStats {
+    /// Number of documents in the index.
+    pub number_of_documents: u64,
+    /// Size taken up by the index' DB, in bytes.
+    ///
+    /// This includes the size taken by both the used and free pages of the DB, and as the free pages
+    /// are not returned to the disk after a deletion, this number is typically larger than
+    /// `used_database_size` that only includes the size of the used pages.
+    pub database_size: u64,
+    /// Size taken by the used pages of the index' DB, in bytes.
+    ///
+    /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
+    /// this value is typically smaller than `database_size`.
+    pub used_database_size: u64,
+    /// Association of every field name with the number of times it occurs in the documents.
+    pub field_distribution: FieldDistribution,
+    /// Creation date of the index.
+    #[serde(with = "time::serde::rfc3339")]
+    pub created_at: time::OffsetDateTime,
+    /// Date of the last update of the index.
+    #[serde(with = "time::serde::rfc3339")]
+    pub updated_at: time::OffsetDateTime,
+}
+
+impl From<v1_9::IndexStats> for IndexStats {
+    fn from(
+        v1_9::IndexStats {
+            number_of_documents,
+            database_size,
+            used_database_size,
+            field_distribution,
+            created_at,
+            updated_at,
+        }: v1_9::IndexStats,
+    ) -> Self {
+        IndexStats {
+            number_of_documents,
+            database_size,
+            used_database_size,
+            field_distribution,
+            created_at: created_at.0,
+            updated_at: updated_at.0,
+        }
+    }
+}
+
+#[derive(serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
+
+fn update_index_stats(
+    index_stats: Database<UuidCodec, Unspecified>,
+    index_uid: &str,
+    index_uuid: uuid::Uuid,
+    sched_wtxn: &mut RwTxn,
+) -> anyhow::Result<()> {
+    let ctx = || format!("while updating index stats for index `{index_uid}`");
+
+    let stats: Option<&str> = index_stats
+        .remap_data_type::<Str>()
+        .get(sched_wtxn, &index_uuid)
+        .with_context(ctx)
+        .with_context(|| "While reading value")?;
+    dbg!(stats);
+
+    let stats: Option<v1_9::IndexStats> = index_stats
+        .remap_data_type::<SerdeJson<v1_9::IndexStats>>()
+        .get(sched_wtxn, &index_uuid)
+        .with_context(ctx)
+        .with_context(|| "While reading value")?;
+
+    if let Some(stats) = stats {
+        let stats: self::IndexStats = stats.into();
+
+        index_stats
+            .remap_data_type::<SerdeJson<self::IndexStats>>()
+            .put(sched_wtxn, &index_uuid, &stats)
+            .with_context(ctx)
+            .with_context(|| "While writing value")?;
+    }
+
+    Ok(())
+}
+
+fn update_date_format(
+    index_uid: &str,
+    index_env: &Env,
+    index_wtxn: &mut RwTxn,
+) -> anyhow::Result<()> {
+    let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
+        .with_context(|| format!("while updating date format for index `{index_uid}`"))?;
+
+    date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
+    date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
+
+    Ok(())
+}
+
+fn find_rest_embedders(
+    index_uid: &str,
+    index_env: &Env,
+    index_txn: &RoTxn,
+) -> anyhow::Result<Vec<String>> {
+    let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
+        .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
+
+    let mut rest_embedders = vec![];
+
+    for config in main
+        .remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
+        .get(index_txn, main_key::EMBEDDING_CONFIGS)?
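+        // A missing `EMBEDDING_CONFIGS` key simply means this index defines no
+        // embedders, so fall back to an empty list instead of erroring.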
+        .unwrap_or_default()
+    {
+        if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
+            rest_embedders.push(config.name);
+        }
+    }
+
+    Ok(rest_embedders)
+}
+
+fn date_round_trip(
+    wtxn: &mut RwTxn,
+    index_uid: &str,
+    db: Database<Unspecified, Unspecified>,
+    key: &str,
+) -> anyhow::Result<()> {
+    let datetime =
+        db.remap_types::<Str, SerdeJson<v1_9::LegacyDateTime>>().get(wtxn, key).with_context(
+            || format!("could not read `{key}` while updating date format for index `{index_uid}`"),
+        )?;
+
+    if let Some(datetime) = datetime {
+        db.remap_types::<Str, SerdeJson<self::OffsetDateTime>>()
+            .put(wtxn, key, &self::OffsetDateTime(datetime.0))
+            .with_context(|| {
+                format!(
+                    "could not write `{key}` while updating date format for index `{index_uid}`"
+                )
+            })?;
+    }
+
+    Ok(())
+}
+
+pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> {
+    println!("Upgrading from v1.9.0 to v1.10.0");
+    // 2 changes here
+
+    // 1. date format. needs to be done before opening the Index
+    // 2. REST embedders. We don't support this case right now, so bail
+
+    let index_scheduler_path = db_path.join("tasks");
+    let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
+        .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
+
+    let mut sched_wtxn = env.write_txn()?;
+
+    let index_mapping: Database<Str, UuidCodec> =
+        try_opening_database(&env, &sched_wtxn, "index-mapping")?;
+
+    let index_stats: Database<UuidCodec, Unspecified> =
+        try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
+            format!("While trying to open {:?}", index_scheduler_path.display())
+        })?;
+
+    let index_count =
+        index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
+
+    // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
+    // 1. immutably for the iteration
+    // 2. mutably for updating index stats
+    let indexes: Vec<_> = index_mapping
+        .iter(&sched_wtxn)?
+        .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
+        .collect();
+
+    let mut rest_embedders = Vec::new();
+
+    let mut unwrapped_indexes = Vec::new();
+
+    // check that update can take place
+    for (index_index, result) in indexes.into_iter().enumerate() {
+        let (uid, uuid) = result?;
+        let index_path = db_path.join("indexes").join(uuid.to_string());
+
+        println!(
+            "[{}/{index_count}] Checking that update can take place for `{uid}` at `{}`",
+            index_index + 1,
+            index_path.display()
+        );
+
+        let index_env = unsafe {
+            // FIXME: fetch the 25 magic number from the index file
+            EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
+                format!("while opening index {uid} at '{}'", index_path.display())
+            })?
+        };
+
+        let index_txn = index_env.read_txn().with_context(|| {
+            format!(
+                "while obtaining a read transaction for index {uid} at {}",
+                index_path.display()
+            )
+        })?;
+
+        println!("\t- Checking for incompatible embedders (REST embedders)");
+        let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
+
+        if rest_embedders_for_index.is_empty() {
+            unwrapped_indexes.push((uid, uuid));
+        } else {
+            // no need to add to unwrapped indexes because we'll exit early
+            rest_embedders.push((uid, rest_embedders_for_index));
+        }
+    }
+
+    if !rest_embedders.is_empty() {
+        let rest_embedders = rest_embedders
+            .into_iter()
+            .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
+            .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
+            .collect::<Vec<String>>()
+            .join("\n");
+
+    println!("Update can take place, updating");
+
+    for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
+        let index_path = db_path.join("indexes").join(uuid.to_string());
+
+        println!(
+            "[{}/{index_count}] Updating index `{uid}` at `{}`",
+            index_index + 1,
+            index_path.display()
+        );
+
+        let index_env = unsafe {
+            // FIXME: fetch the 25 magic number from the index file
+            EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
+                format!("while opening index {uid} at '{}'", index_path.display())
+            })?
+        };
+
+        let mut index_wtxn = index_env.write_txn().with_context(|| {
+            format!(
+                "while obtaining a write transaction for index `{uid}` at `{}`",
+                index_path.display()
+            )
+        })?;
+
+        println!("\t- Updating index stats");
+        update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
+        println!("\t- Updating date format");
+        update_date_format(&uid, &index_env, &mut index_wtxn)?;
+
+        index_wtxn.commit().with_context(|| {
+            format!("while committing the write txn for index `{uid}` at {}", index_path.display())
+        })?;
+    }
+
+    sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
+
+    println!("Upgrading database succeeded");
+
+    Ok(())
+}
diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs
new file mode 100644
index 000000000..0c84d3842
--- /dev/null
+++ b/crates/meilitool/src/upgrade/v1_11.rs
@@ -0,0 +1,85 @@
+//! The breaking changes that happened between the v1.10 and the v1.11 are:
+//! - Arroy went from v0.4.0 to v0.5.0; see the release notes for the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0
+//! - The `angular` distance has been renamed to `cosine` => we only need to update the string in the metadata.
+//! - The `NodeId` layout was reorganized to make appending vectors work => we have to update the keys of almost all items in the DB.
+//! - The list of updated IDs is now stored directly in LMDB instead of as a roaring bitmap => this shouldn't be an issue since we are never supposed to commit this roaring bitmap, but arroy doesn't forbid it, so making sure it works is safer than assuming.
+
+use std::path::Path;
+
+use anyhow::Context;
+use meilisearch_types::{
+    heed::{types::Str, Database, EnvOpenOptions},
+    milli::index::db_name,
+};
+
+use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
+
+pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> {
+    println!("Upgrading from v1.10.0 to v1.11.0");
+
+    let index_scheduler_path = db_path.join("tasks");
+    let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
+        .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
+
+    let sched_rtxn = env.read_txn()?;
+
+    let index_mapping: Database<Str, UuidCodec> =
+        try_opening_database(&env, &sched_rtxn, "index-mapping")?;
+
+    let index_count =
+        index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?;
+
+    let indexes: Vec<_> = index_mapping
+        .iter(&sched_rtxn)?
+        .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
+        .collect();
+
+    for (index_index, result) in indexes.into_iter().enumerate() {
+        let (uid, uuid) = result?;
+        let index_path = db_path.join("indexes").join(uuid.to_string());
+
+        println!(
+            "[{}/{index_count}] Updating embeddings for `{uid}` at `{}`",
+            index_index + 1,
+            index_path.display()
+        );
+
+        let index_env = unsafe {
+            EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
+                format!("while opening index {uid} at '{}'", index_path.display())
+            })?
+        };
+
+        let index_rtxn = index_env.read_txn().with_context(|| {
+            format!(
+                "while obtaining a read transaction for index {uid} at {}",
+                index_path.display()
+            )
+        })?;
+        let index_read_database =
+            try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
+                .with_context(|| format!("while opening the vector-arroy database for index `{uid}`"))?;
+
+        let mut index_wtxn = index_env.write_txn().with_context(|| {
+            format!(
+                "while obtaining a write transaction for index {uid} at {}",
+                index_path.display()
+            )
+        })?;
+
+        let index_write_database =
+            try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
+                .with_context(|| format!("while opening the vector-arroy database for index `{uid}`"))?;
+
+        arroy_v04_to_v05::ugrade_from_prev_version(
+            &index_rtxn,
+            index_read_database,
+            &mut index_wtxn,
+            index_write_database,
+        )?;
+
+        index_wtxn.commit()?;
+    }
+
+    Ok(())
+}
diff --git a/crates/meilitool/src/upgrade/v1_9.rs b/crates/meilitool/src/upgrade/v1_9.rs
new file mode 100644
index 000000000..96cbfe68c
--- /dev/null
+++ b/crates/meilitool/src/upgrade/v1_9.rs
@@ -0,0 +1,158 @@
+use serde::{Deserialize, Serialize};
+use time::{Date, OffsetDateTime, Time, UtcOffset};
+
+pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
+
+/// The statistics that can be computed from an `Index` object.
+#[derive(serde::Deserialize, Debug)]
+pub struct IndexStats {
+    /// Number of documents in the index.
+    pub number_of_documents: u64,
+    /// Size taken up by the index's DB, in bytes.
+    ///
+    /// This includes the size taken by both the used and free pages of the DB, and as the free pages
+    /// are not returned to the disk after a deletion, this number is typically larger than
+    /// `used_database_size`, which only includes the size of the used pages.
+    pub database_size: u64,
+    /// Size taken by the used pages of the index's DB, in bytes.
+    ///
+    /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
+    /// this value is typically smaller than `database_size`.
+    pub used_database_size: u64,
+    /// Association of every field name with the number of times it occurs in the documents.
+    pub field_distribution: FieldDistribution,
+    /// Creation date of the index.
+    pub created_at: LegacyDateTime,
+    /// Date of the last update of the index.
+    pub updated_at: LegacyDateTime,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+pub struct IndexEmbeddingConfig {
+    pub name: String,
+    pub config: EmbeddingConfig,
+}
+
+#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
+pub struct EmbeddingConfig {
+    /// Options of the embedder, specific to each kind of embedder
+    pub embedder_options: EmbedderOptions,
+}
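// Aside (not part of the patch): a self-contained sketch of the check that
// `find_rest_embedders` above performs once the configs are deserialized —
// keep the name of every embedder whose options are the `Rest` variant. The
// inline types are simplified stand-ins for the real ones defined below.
fn rest_embedder_names() {
    enum EmbedderOptions { OpenAi, Rest }
    struct IndexEmbeddingConfig { name: String, embedder_options: EmbedderOptions }

    let configs = vec![
        IndexEmbeddingConfig { name: "default".into(), embedder_options: EmbedderOptions::OpenAi },
        IndexEmbeddingConfig { name: "custom".into(), embedder_options: EmbedderOptions::Rest },
    ];

    // Exactly like the loop in `find_rest_embedders`: match on the variant.
    let rest_embedders: Vec<String> = configs
        .into_iter()
        .filter(|c| matches!(c.embedder_options, EmbedderOptions::Rest))
        .map(|c| c.name)
        .collect();

    assert_eq!(rest_embedders, ["custom"]);
}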
+/// Options of an embedder, specific to each kind of embedder.
+#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+pub enum EmbedderOptions {
+    HuggingFace(hf::EmbedderOptions),
+    OpenAi(openai::EmbedderOptions),
+    Ollama(ollama::EmbedderOptions),
+    UserProvided(manual::EmbedderOptions),
+    Rest(rest::EmbedderOptions),
+}
+
+impl Default for EmbedderOptions {
+    fn default() -> Self {
+        Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
+    }
+}
+
+mod hf {
+    #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+    pub struct EmbedderOptions {
+        pub model: String,
+        pub revision: Option<String>,
+    }
+}
+mod openai {
+
+    #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+    pub struct EmbedderOptions {
+        pub api_key: Option<String>,
+        pub dimensions: Option<usize>,
+    }
+}
+mod ollama {
+    #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+    pub struct EmbedderOptions {
+        pub embedding_model: String,
+        pub url: Option<String>,
+        pub api_key: Option<String>,
+    }
+}
+mod manual {
+    #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+    pub struct EmbedderOptions {
+        pub dimensions: usize,
+    }
+}
+mod rest {
+    #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
+    pub struct EmbedderOptions {
+        pub api_key: Option<String>,
+        pub dimensions: Option<usize>,
+        pub url: String,
+        pub input_field: Vec<String>,
+        // path to the array of embeddings
+        pub path_to_embeddings: Vec<String>,
+        // shape of a single embedding
+        pub embedding_object: Vec<String>,
+    }
+}
+
+/// A datetime from Meilisearch v1.9 with an unspecified format.
+#[derive(Debug)]
+pub struct LegacyDateTime(pub OffsetDateTime);
+
+impl<'de> Deserialize<'de> for LegacyDateTime {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct Visitor;
+        impl<'de> serde::de::Visitor<'de> for Visitor {
+            type Value = OffsetDateTime;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                write!(formatter, "a valid datetime")
+            }
+
+            // Comes from a binary. The legacy format is:
+            // 2024-11-04 13:32:08.48368 +00:00:00
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]");
+                OffsetDateTime::parse(v, format).map_err(E::custom)
+            }
+
+            // Comes from the docker image, the legacy format is:
+            // [2024, 309, 17, 15, 1, 698184971, 0, 0, 0]
+            // year, day in year, hour, minute, sec, subsec, offset stuff
+            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let mut vec = Vec::new();
+                // We must deserialize the value as `i64` because the largest values are `u32` and `i32`
+                while let Some(el) = seq.next_element::<i64>()? {
+                    vec.push(el);
+                }
+                if vec.len() != 9 {
+                    return Err(serde::de::Error::custom(format!(
+                        "Invalid datetime, received an array of {} elements instead of 9",
+                        vec.len()
+                    )));
+                }
+                Ok(OffsetDateTime::new_in_offset(
+                    Date::from_ordinal_date(vec[0] as i32, vec[1] as u16)
+                        .map_err(serde::de::Error::custom)?,
+                    Time::from_hms_nano(vec[2] as u8, vec[3] as u8, vec[4] as u8, vec[5] as u32)
+                        .map_err(serde::de::Error::custom)?,
+                    UtcOffset::from_hms(vec[6] as i8, vec[7] as i8, vec[8] as i8)
+                        .map_err(serde::de::Error::custom)?,
+                ))
+            }
+        }
+        deserializer.deserialize_any(Visitor).map(LegacyDateTime)
+    }
+}
diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml
index 5fc2d65c8..7b43fbf33 100644
--- a/crates/milli/Cargo.toml
+++ b/crates/milli/Cargo.toml
@@ -15,7 +15,7 @@ license.workspace = true
 bimap = { version = "0.6.3", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.9.1"
-bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
+bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] }
 byteorder = "1.5.0"
 charabia = { version = "0.9.1", default-features = false }
 concat-arrays = "0.1.2"
@@ -79,8 +79,8 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
 ] }
 tiktoken-rs = "0.5.9"
 liquid = "0.26.6"
-rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
-arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
+rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
+arroy = "0.5.0"
 rand = "0.8.5"
 tracing = "0.1.40"
 ureq = { version = "2.10.0", features = ["json"] }
@@ -98,16 +98,7 @@ rand = { version = "0.8.5", features = ["small_rng"] }
 
 [features]
 all-tokenizations = [
-    "charabia/chinese",
-    "charabia/hebrew",
-    "charabia/japanese",
-    "charabia/thai",
-    "charabia/korean",
-    "charabia/greek",
-    "charabia/khmer",
-    "charabia/vietnamese",
-    "charabia/swedish-recomposition",
-    "charabia/german-segmentation",
+    "charabia/default",
 ]
 
 # Use POSIX semaphores instead of SysV semaphores in LMDB
@@ -146,5 +137,8 @@ german = ["charabia/german-segmentation"]
 # force swedish character recomposition
 swedish-recomposition = ["charabia/swedish-recomposition"]
 
+# allow turkish specialized tokenization
+turkish = ["charabia/turkish"]
+
 # allow CUDA support, see
 cuda = ["candle-core/cuda"]
diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs
index f4bd42689..3b48b50f2 100644
--- a/crates/milli/src/error.rs
+++ b/crates/milli/src/error.rs
@@ -298,6 +298,7 @@ impl From<arroy::Error> for Error {
             arroy::Error::InvalidVecDimension { expected, received } => {
                 Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
             }
+            arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
             arroy::Error::DatabaseFull
             | arroy::Error::InvalidItemAppend
            | arroy::Error::UnmatchingDistance { .. }
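// Aside (not part of the patch): a sketch of the kind of cancellation callback
// that pairs with the new `arroy::Error::BuildCancelled` mapping above.
// Indexation flips a shared flag, and the vector-store build polls it through
// a `Fn() -> bool` closure (see `build_and_quantize` further down); the names
// here are illustrative, not the real API surface.
use std::sync::atomic::{AtomicBool, Ordering};

fn run_cancellable_build(should_abort: &AtomicBool) -> Result<(), &'static str> {
    // The closure has the `Fn() -> bool + Sync + Send` shape the builder expects.
    let cancel = || should_abort.load(Ordering::Relaxed);

    for _step in 0..1000 {
        if cancel() {
            // arroy reports this as `Error::BuildCancelled`, which milli maps
            // to `InternalError::AbortedIndexation`.
            return Err("build cancelled");
        }
        // ... do one unit of build work ...
    }
    Ok(())
}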
diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs
index c47896df7..5b7a9c58c 100644
--- a/crates/milli/src/index.rs
+++ b/crates/milli/src/index.rs
@@ -1610,24 +1610,6 @@ impl Index {
             .unwrap_or_default())
     }
 
-    pub fn arroy_readers<'a>(
-        &'a self,
-        rtxn: &'a RoTxn<'a>,
-        embedder_id: u8,
-        quantized: bool,
-    ) -> impl Iterator<Item = Result<ArroyWrapper>> + 'a {
-        crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| {
-            let reader = ArroyWrapper::new(self.vector_arroy, k, quantized);
-            // Here we don't care about the dimensions, but we want to know if we can read
-            // in the database or if its metadata are missing because there is no document with that many vectors.
-            match reader.dimensions(rtxn) {
-                Ok(_) => Some(Ok(reader)),
-                Err(arroy::Error::MissingMetadata(_)) => None,
-                Err(e) => Some(Err(e.into())),
-            }
-        })
-    }
-
     pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
         self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
     }
@@ -1649,14 +1631,9 @@
         let embedding_configs = self.embedding_configs(rtxn)?;
         for config in embedding_configs {
             let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
-            let embeddings = self
-                .arroy_readers(rtxn, embedder_id, config.config.quantized())
-                .map_while(|reader| {
-                    reader
-                        .and_then(|r| r.item_vector(rtxn, docid).map_err(|e| e.into()))
-                        .transpose()
-                })
-                .collect::<Result<Vec<Embedding>>>()?;
+            let reader =
+                ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
+            let embeddings = reader.item_vectors(rtxn, docid)?;
             res.insert(config.name.to_owned(), embeddings);
         }
         Ok(res)
diff --git a/crates/milli/src/search/new/matches/best_match_interval.rs b/crates/milli/src/search/new/matches/best_match_interval.rs
new file mode 100644
index 000000000..a6497f351
--- /dev/null
+++ b/crates/milli/src/search/new/matches/best_match_interval.rs
@@ -0,0 +1,139 @@
+use super::matching_words::WordId;
+use super::{Match, MatchPosition};
+
+struct MatchIntervalWithScore {
+    interval: [usize; 2],
+    score: [i16; 3],
+}
+
+// count score for phrases
+fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
+    let words_in_phrase_minus_one = (lwp - fwp) as i16;
+    // will always be ordered, so +1 for each space between words
+    *order_score += words_in_phrase_minus_one;
+    // distance will always be 1, so -1 for each space between words
+    *distance_score -= words_in_phrase_minus_one;
+}
+
+/// Compute the score of a match interval:
+/// 1) count unique matches
+/// 2) calculate distance between matches
+/// 3) count ordered matches
+fn get_interval_score(matches: &[Match]) -> [i16; 3] {
+    let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
+    let mut order_score = 0;
+    let mut distance_score = 0;
+
+    let mut iter = matches.iter().peekable();
+    while let Some(m) = iter.next() {
+        if let Some(next_match) = iter.peek() {
+            // if matches are ordered
+            if next_match.ids.iter().min() > m.ids.iter().min() {
+                order_score += 1;
+            }
+
+            let m_last_word_pos = match m.position {
+                MatchPosition::Word { word_position, .. } => word_position,
+                MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
+                    tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+                    lwp
+                }
+            };
+            let next_match_first_word_pos = next_match.get_first_word_pos();
+
+            // compute distance between matches
+            distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
+        } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
+            // in case the last match is a phrase, count the score for its words
+            tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+        }
+
+        ids.extend(m.ids.iter());
+    }
+
+    ids.sort_unstable();
+    ids.dedup();
+    let uniq_score = ids.len() as i16;
+
+    // rank by unique match count, then by distance between matches, then by ordered match count.
+    [uniq_score, distance_score, order_score]
+}
+
+/// Returns the first and last match where the score computed by match_interval_score is the best.
+pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
+    if matches.is_empty() {
+        panic!("`matches` should not be empty at this point");
+    }
+
+    // positions of the first and the last match of the best matches interval in `matches`.
+    let mut best_interval: Option<MatchIntervalWithScore> = None;
+
+    let mut save_best_interval = |interval_first, interval_last| {
+        let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
+        let is_interval_score_better = &best_interval
+            .as_ref()
+            .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
+
+        if *is_interval_score_better {
+            best_interval = Some(MatchIntervalWithScore {
+                interval: [interval_first, interval_last],
+                score: interval_score,
+            });
+        }
+    };
+
+    // we compute the matches interval if we have at least 2 matches.
+    // current interval positions.
+    let mut interval_first = 0;
+    let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+    for (index, next_match) in matches.iter().enumerate() {
+        // if the next match would make the interval grow larger than crop_size,
+        // we compare the current interval with the best one,
+        // then we increase `interval_first` until the next match can be added.
+        let next_match_last_word_pos = next_match.get_last_word_pos();
+
+        // if the next match would mean that we pass the crop size window,
+        // we take the last valid match, that didn't pass this boundary, which is `index` - 1,
+        // and calculate a score for it, and check if it's better than our best so far
+        if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
+            // if index is 0 there is no last viable match
+            if index != 0 {
+                let interval_last = index - 1;
+                // keep interval if it's the best
+                save_best_interval(interval_first, interval_last);
+            }
+
+            // advance start of the interval while interval is longer than crop_size.
+            loop {
+                interval_first += 1;
+                if interval_first == matches.len() {
+                    interval_first -= 1;
+                    break;
+                }
+
+                interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+                if interval_first_match_first_word_pos > next_match_last_word_pos
+                    || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
+                {
+                    break;
+                }
+            }
+        }
+    }
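// Aside (not part of the patch): a quick illustration of why the score is an
// `[i16; 3]` — arrays compare lexicographically, so the unique-match count
// wins first, then the (negative) distance penalty, then the ordered-match
// count, exactly the ranking described in `get_interval_score`.
fn score_ordering_example() {
    let more_unique_matches: [i16; 3] = [3, -9, 0];
    let closer_matches: [i16; 3] = [2, -1, 2];
    // 3 unique matches beat 2, regardless of the other components.
    assert!(more_unique_matches > closer_matches);

    let closer: [i16; 3] = [2, -1, 0];
    let farther: [i16; 3] = [2, -5, 2];
    // with equal uniqueness, the smaller distance penalty wins.
    assert!(closer > farther);
}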
+
+    // compute the last interval score and compare it to the best one.
+    let interval_last = matches.len() - 1;
+    // if it's the last match with itself, we need to make sure it's
+    // not a phrase longer than the crop window
+    if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
+        save_best_interval(interval_first, interval_last);
+    }
+
+    // if none of the matches fit the criteria above, default to the first one
+    best_interval.map_or(
+        [&matches[0], &matches[0]],
+        |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
+    )
+}
diff --git a/crates/milli/src/search/new/matches/match.rs b/crates/milli/src/search/new/matches/match.rs
new file mode 100644
index 000000000..2eef4d5a6
--- /dev/null
+++ b/crates/milli/src/search/new/matches/match.rs
@@ -0,0 +1,62 @@
+use super::matching_words::WordId;
+
+#[derive(Clone, Debug)]
+pub enum MatchPosition {
+    Word {
+        // position of the word in the whole text.
+        word_position: usize,
+        // position of the token in the whole text.
+        token_position: usize,
+    },
+    Phrase {
+        // position of the first and last word in the phrase in the whole text.
+        word_positions: [usize; 2],
+        // position of the first and last token in the phrase in the whole text.
+        token_positions: [usize; 2],
+    },
+}
+
+#[derive(Clone, Debug)]
+pub struct Match {
+    pub char_count: usize,
+    // ids of the query words that match.
+    pub ids: Vec<WordId>,
+    pub position: MatchPosition,
+}
+
+impl Match {
+    pub(super) fn get_first_word_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
+        }
+    }
+
+    pub(super) fn get_last_word_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
+        }
+    }
+
+    pub(super) fn get_first_token_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
+        }
+    }
+
+    pub(super) fn get_last_token_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
+        }
+    }
+
+    pub(super) fn get_word_count(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { .. } => 1,
+            MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
+        }
+    }
+}
diff --git a/crates/milli/src/search/new/matches/matching_words.rs b/crates/milli/src/search/new/matches/matching_words.rs
index 4ad5c37ec..1f30a17ad 100644
--- a/crates/milli/src/search/new/matches/matching_words.rs
+++ b/crates/milli/src/search/new/matches/matching_words.rs
@@ -86,14 +86,17 @@ impl MatchingWords {
                 continue;
             };
             let prefix_length = char_index + c.len_utf8();
-            let char_len = token.original_lengths(prefix_length).0;
+            let (char_count, byte_len) = token.original_lengths(prefix_length);
             let ids = &located_words.positions;
-            return Some(MatchType::Full { char_len, ids });
+            return Some(MatchType::Full { ids, char_count, byte_len });
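// Aside (not part of the patch): a std-only illustration of why the match type
// now tracks `char_count` and `byte_len` separately — outside ASCII they
// differ, and mixing them up cuts highlight boundaries mid-character.
fn char_count_vs_byte_len() {
    let word = "héllo";
    let char_count = word.chars().count();
    let byte_len = word.len();
    assert_eq!(char_count, 5);
    assert_eq!(byte_len, 6); // "é" is two bytes in UTF-8
}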
             // else we exact match the token.
             } else if token.lemma() == word {
-                let char_len = token.char_end - token.char_start;
                 let ids = &located_words.positions;
-                return Some(MatchType::Full { char_len, ids });
+                return Some(MatchType::Full {
+                    char_count: token.char_end - token.char_start,
+                    byte_len: token.byte_end - token.byte_start,
+                    ids,
+                });
             }
         }
     }
@@ -130,7 +133,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> {
                 word.map(|word| self.matching_words.word_interner.get(word).as_str())
             })
             .collect();
-        let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
+        let partial = PartialMatch { matching_words: words, ids };
 
         partial.match_token(self.token).or_else(|| self.next())
     }
@@ -149,7 +152,7 @@ pub type WordId = u16;
 /// In these cases we need to match consecutively several tokens to consider that the match is full.
 #[derive(Debug, PartialEq)]
 pub enum MatchType<'a> {
-    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
+    Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
     Partial(PartialMatch<'a>),
 }
 
@@ -158,7 +161,6 @@
 pub struct PartialMatch<'a> {
     matching_words: Vec<Option<Interned<String>>>,
     ids: &'a RangeInclusive<WordId>,
-    char_len: usize,
 }
 
 impl<'a> PartialMatch<'a> {
@@ -176,25 +178,24 @@
             None => token.is_stopword(),
         };
 
-        let char_len = token.char_end - token.char_start;
-
         // if there are remaining words to match in the phrase and the current token is matching,
         // return a new Partial match allowing the highlighter to continue.
         if is_matching && matching_words.len() > 1 {
             matching_words.remove(0);
-            Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
+            Some(MatchType::Partial(Self { matching_words, ids }))
         // if there is no remaining word to match in the phrase and the current token is matching,
         // return a Full match.
         } else if is_matching {
-            Some(MatchType::Full { char_len, ids })
+            Some(MatchType::Full {
+                char_count: token.char_end - token.char_start,
+                byte_len: token.byte_end - token.byte_start,
+                ids,
+            })
         // if the current token doesn't match, return None to break the match sequence.
         } else {
             None
         }
     }
-
-    pub fn char_len(&self) -> usize {
-        self.char_len
-    }
 }
 
 impl fmt::Debug for MatchingWords {
@@ -276,7 +277,7 @@ pub(crate) mod tests {
                 ..Default::default()
             })
             .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
         );
         assert_eq!(
             matching_words
@@ -300,7 +301,7 @@
                 ..Default::default()
            })
            .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        assert_eq!(
            matching_words
@@ -312,7 +313,7 @@
                ..Default::default()
            })
            .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        assert_eq!(
            matching_words
diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index 4688b8f32..80e3ec7b2 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -1,11 +1,19 @@
-use std::borrow::Cow;
+mod best_match_interval;
+mod r#match;
+mod matching_words;
+mod simple_token_kind;
 
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
+use either::Either;
 pub use matching_words::MatchingWords;
-use matching_words::{MatchType, PartialMatch, WordId};
+use matching_words::{MatchType, PartialMatch};
+use r#match::{Match, MatchPosition};
 use serde::Serialize;
-
-pub mod matching_words;
+use simple_token_kind::SimpleTokenKind;
+use std::{
+    borrow::Cow,
+    cmp::{max, min},
+};
 
 const DEFAULT_CROP_MARKER: &str = "…";
 const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
@@ -93,17 +101,6 @@ impl FormatOptions {
     }
 }
 
-#[derive(Clone, Debug)]
-pub struct Match {
-    match_len: usize,
-    // ids of the query words that matches.
-    ids: Vec<WordId>,
-    // position of the word in the whole text.
-    word_position: usize,
-    // position of the token in the whole text.
-    token_position: usize,
-}
-
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
 pub struct MatchBounds {
     pub start: usize,
@@ -130,41 +127,27 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     /// compute_partial_match peek into next words to validate if the match is complete.
     fn compute_partial_match<'a>(
         mut partial: PartialMatch<'a>,
-        token_position: usize,
-        word_position: usize,
+        first_token_position: usize,
+        first_word_position: usize,
+        first_word_char_start: &usize,
         words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
         matches: &mut Vec<Match>,
     ) -> bool {
-        let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
-
         for (token_position, word_position, word) in words_positions {
             partial = match partial.match_token(word) {
                 // token matches the partial match, but the match is not full,
                 // we temporarily save the current token then we try to match the next one.
-                Some(MatchType::Partial(partial)) => {
-                    potential_matches.push((token_position, word_position, partial.char_len()));
-                    partial
-                }
+                Some(MatchType::Partial(partial)) => partial,
                 // partial match is now full, we keep the matches and we advance positions
-                Some(MatchType::Full { char_len, ids }) => {
-                    let ids: Vec<_> = ids.clone().collect();
-                    // save previously matched tokens as matches.
-                    let iter = potential_matches.into_iter().map(
-                        |(token_position, word_position, match_len)| Match {
-                            match_len,
-                            ids: ids.clone(),
-                            word_position,
-                            token_position,
-                        },
-                    );
-                    matches.extend(iter);
-
+                Some(MatchType::Full { ids, .. }) => {
                     // save the token that closes the partial match as a match.
                     matches.push(Match {
-                        match_len: char_len,
-                        ids,
-                        word_position,
-                        token_position,
+                        char_count: word.char_end - *first_word_char_start,
+                        ids: ids.clone().collect(),
+                        position: MatchPosition::Phrase {
+                            word_positions: [first_word_position, word_position],
+                            token_positions: [first_token_position, token_position],
+                        },
                     });
 
                     // the match is complete, we return true.
@@ -202,13 +185,12 @@
             match match_type {
                 // we match, we save the current token as a match,
                 // then we continue the rest of the tokens.
-                MatchType::Full { char_len, ids } => {
+                MatchType::Full { ids, char_count, .. } => {
                     let ids: Vec<_> = ids.clone().collect();
                     matches.push(Match {
-                        match_len: char_len,
+                        char_count,
                         ids,
-                        word_position,
-                        token_position,
+                        position: MatchPosition::Word { word_position, token_position },
                     });
                     break;
                 }
@@ -221,6 +203,7 @@
                         partial,
                         token_position,
                         word_position,
+                        &word.char_start,
                         &mut wp,
                         &mut matches,
                     ) {
@@ -243,56 +226,99 @@
             Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: tokens[m.token_position].byte_start,
-                    length: m.match_len,
+                    start: tokens[m.get_first_token_pos()].byte_start,
+                    // TODO: Why is this in chars, while start is in bytes?
+                    length: m.char_count,
                 })
                 .collect(),
         }
     }
 
     /// Returns the bounds in byte index of the crop window.
-    fn crop_bounds(
-        &self,
-        tokens: &[Token<'_>],
-        matches: &[Match],
-        crop_size: usize,
-    ) -> (usize, usize) {
-        // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
-        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
-        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
+    fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
+        let (
+            mut remaining_words,
+            is_iterating_forward,
+            before_tokens_starting_index,
+            after_tokens_starting_index,
+        ) = if !matches.is_empty() {
+            let [matches_first, matches_last] =
+                best_match_interval::find_best_match_interval(matches, crop_size);
 
-        // matches needs to be counted in the crop len.
-        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
+            let matches_size =
+                matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
+
+            let is_crop_size_gte_match_size = crop_size >= matches_size;
+            let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
+
+            let remaining_words = if is_crop_size_gte_match_size {
+                crop_size - matches_size
+            } else {
+                // in case matches size is greater than crop size, which implies there's only one match,
+                // we count words backwards, because we have to remove words, as they're extra words outside of
+                // crop window
+                matches_size - crop_size
+            };
+
+            let after_tokens_starting_index = if matches_size == 0 {
+                0
+            } else {
+                let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
+                if last_match_last_token_position_plus_one < tokens.len() {
+                    last_match_last_token_position_plus_one
+                } else {
+                    // we have matched the end of possible tokens, there's nothing to advance
+                    tokens.len() - 1
+                }
+            };
+
+            (
+                remaining_words,
+                is_iterating_forward,
+                if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
+                after_tokens_starting_index,
+            )
+        } else {
+            (crop_size, true, 0, 0)
+        };
 
         // create the initial state of the crop window: 2 iterators starting from the matches positions,
         // a reverse iterator starting from the first match token position and going towards the beginning of the text,
-        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
-        // an iterator starting from the last match token position and going towards the end of the text.
-        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
+        let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
+        // an iterator ...
+        let mut after_tokens = if is_iterating_forward {
+            // ... starting from the last match token position and going towards the end of the text.
+            Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
+        } else {
+            // ... starting from the last match token position and going towards the start of the text.
+            Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
+        };
 
         // grows the crop window peeking in both directions
         // until the window contains the good number of words:
         while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.separator_kind());
-            let after_token = after_tokens.peek().map(|t| t.separator_kind());
+            let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
+            let after_token_kind =
+                after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
 
-            match (before_token, after_token) {
+            match (before_token_kind, after_token_kind) {
                 // we can expand both sides.
-                (Some(before_token), Some(after_token)) => {
-                    match (before_token, after_token) {
+                (Some(before_token_kind), Some(after_token_kind)) => {
+                    match (before_token_kind, after_token_kind) {
                         // if they are both separators and are the same kind then advance both,
                        // or expand on the soft separator side.
-                        (Some(before_token_kind), Some(after_token_kind)) => {
-                            if before_token_kind == after_token_kind {
+                        (
+                            SimpleTokenKind::Separator(before_token_separator_kind),
+                            SimpleTokenKind::Separator(after_token_separator_kind),
+                        ) => {
+                            if before_token_separator_kind == after_token_separator_kind {
                                 before_tokens.next();
 
                                 // this avoids having an ending separator before the crop marker.
                                 if remaining_words > 1 {
                                     after_tokens.next();
                                 }
-                            } else if before_token_kind == SeparatorKind::Hard {
+                            } else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
                                 after_tokens.next();
                             } else {
                                 before_tokens.next();
@@ -300,17 +326,17 @@
                         }
                         // if one of the tokens is a word, we expand on the side of the word.
                         // left is a word, advance left.
-                        (None, Some(_)) => {
+                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
                             before_tokens.next();
                             remaining_words -= 1;
                         }
                         // right is a word, advance right.
-                        (Some(_), None) => {
+                        (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
                             after_tokens.next();
                             remaining_words -= 1;
                         }
                        // both are words, advance left then right if remaining_words > 0.
-                        (None, None) => {
+                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
                             before_tokens.next();
                             remaining_words -= 1;
 
@@ -322,16 +348,16 @@
                     }
                 }
                 // the end of the text is reached, advance left.
-                (Some(before_token), None) => {
+                (Some(before_token_kind), None) => {
                     before_tokens.next();
-                    if before_token.is_none() {
+                    if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
                         remaining_words -= 1;
                     }
                 }
                 // the start of the text is reached, advance right.
-                (None, Some(after_token)) => {
+                (None, Some(after_token_kind)) => {
                     after_tokens.next();
-                    if after_token.is_none() {
+                    if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
                         remaining_words -= 1;
                    }
                }
@@ -344,86 +370,7 @@
         let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
         let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
 
-        (crop_byte_start, crop_byte_end)
-    }
-
-    /// Compute the score of a match interval:
-    /// 1) count unique matches
-    /// 2) calculate distance between matches
-    /// 3) count ordered matches
-    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
-        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
-        let mut order_score = 0;
-        let mut distance_score = 0;
-
-        let mut iter = matches.iter().peekable();
-        while let Some(m) = iter.next() {
-            if let Some(next_match) = iter.peek() {
-                // if matches are ordered
-                if next_match.ids.iter().min() > m.ids.iter().min() {
-                    order_score += 1;
-                }
-
-                // compute distance between matches
-                distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
-            }
-
-            ids.extend(m.ids.iter());
-        }
-
-        ids.sort_unstable();
-        ids.dedup();
-        let uniq_score = ids.len() as i16;
-
-        // rank by unique match count, then by distance between matches, then by ordered match count.
-        (uniq_score, distance_score, order_score)
-    }
-
-    /// Returns the matches interval where the score computed by match_interval_score is the best.
-    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
-        // we compute the matches interval if we have at least 2 matches.
-        if matches.len() > 1 {
-            // positions of the first and the last match of the best matches interval in `matches`.
-            let mut best_interval = (0, 0);
-            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
-            // current interval positions.
-            let mut interval_first = 0;
-            let mut interval_last = 0;
-            for (index, next_match) in matches.iter().enumerate().skip(1) {
-                // if next match would make interval gross more than crop_size,
-                // we compare the current interval with the best one,
-                // then we increase `interval_first` until next match can be added.
-                if next_match.word_position - matches[interval_first].word_position >= crop_size {
-                    let interval_score =
-                        self.match_interval_score(&matches[interval_first..=interval_last]);
-
-                    // keep interval if it's the best
-                    if interval_score > best_interval_score {
-                        best_interval = (interval_first, interval_last);
-                        best_interval_score = interval_score;
-                    }
-
-                    // advance start of the interval while interval is longer than crop_size.
-                    while next_match.word_position - matches[interval_first].word_position
-                        >= crop_size
-                    {
-                        interval_first += 1;
-                    }
-                }
-                interval_last = index;
-            }
-
-            // compute the last interval score and compare it to the best one.
-            let interval_score =
-                self.match_interval_score(&matches[interval_first..=interval_last]);
-            if interval_score > best_interval_score {
-                best_interval = (interval_first, interval_last);
-            }
-
-            &matches[best_interval.0..=best_interval.1]
-        } else {
-            matches
-        }
+        [crop_byte_start, crop_byte_end]
     }
 
     // Returns the formatted version of the original text.
@@ -434,69 +381,87 @@
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // crop around the best interval.
-                    let (byte_start, byte_end) = match format_options.crop {
+                    // If the text has to be cropped, crop around the best interval.
+                    let [crop_byte_start, crop_byte_end] = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
-                            let matches = self.find_best_match_interval(matches, crop_size);
                            self.crop_bounds(tokens, matches, crop_size)
                        }
-                        _ => (0, self.text.len()),
+                        _ => [0, self.text.len()],
                    };
 
                    let mut formatted = Vec::new();
 
                    // push crop marker if it's not the start of the text.
-                    if byte_start > 0 && !self.crop_marker.is_empty() {
+                    if crop_byte_start > 0 && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }
 
-                    let mut byte_index = byte_start;
+                    let mut byte_index = crop_byte_start;
 
                    if format_options.highlight {
                        // insert highlight markers around matches.
                        for m in matches {
-                            let token = &tokens[m.token_position];
+                            let [m_byte_start, m_byte_end] = match m.position {
+                                MatchPosition::Word { token_position, .. } => {
+                                    let token = &tokens[token_position];
+                                    [&token.byte_start, &token.byte_end]
+                                }
+                                MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
+                                    [&tokens[ftp].byte_start, &tokens[ltp].byte_end]
+                                }
+                            };
 
-                            // skip matches out of the crop window.
-                            if token.byte_start < byte_start || token.byte_end > byte_end {
+                            // skip matches out of the crop window
+                            if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
                                continue;
                            }
 
-                            if byte_index < token.byte_start {
-                                formatted.push(&self.text[byte_index..token.byte_start]);
+                            // adjust start and end to the crop window size
+                            let [m_byte_start, m_byte_end] = [
+                                max(m_byte_start, &crop_byte_start),
+                                min(m_byte_end, &crop_byte_end),
+                            ];
+
+                            // push text that is positioned before our matches
+                            if byte_index < *m_byte_start {
+                                formatted.push(&self.text[byte_index..*m_byte_start]);
                            }
 
-                            let highlight_byte_index = self.text[token.byte_start..]
-                                .char_indices()
-                                .enumerate()
-                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
 
                             formatted.push(self.highlight_prefix);
-                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
+
+                            // TODO: This is extra work; charabia::token::Token's byte_len
+                            // should already give us the original byte length, but it doesn't
+                            // behave as expected, investigate why
+                            let highlight_byte_index = self.text[*m_byte_start..]
+                                .char_indices()
+                                .nth(m.char_count)
+                                .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
+                            formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
+
                             formatted.push(self.highlight_suffix);
+
                             // if it's a prefix highlight, we put the end of the word after the highlight marker.
-                            if highlight_byte_index < token.byte_end {
-                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            if highlight_byte_index < *m_byte_end {
+                                formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
                             }
 
-                            byte_index = token.byte_end;
+                            byte_index = *m_byte_end;
                         }
                     }
 
                     // push the rest of the text between last match and the end of crop.
-                    if byte_index < byte_end {
-                        formatted.push(&self.text[byte_index..byte_end]);
+                    if byte_index < crop_byte_end {
+                        formatted.push(&self.text[byte_index..crop_byte_end]);
                     }
 
                     // push crop marker if it's not the end of the text.
-                    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
+                    if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }
 
                     if formatted.len() == 1 {
                         // avoid concatenating if there is already 1 slice.
-                        Cow::Borrowed(&self.text[byte_start..byte_end])
+                        Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
                     } else {
                         Cow::Owned(formatted.concat())
                     }
@@ -821,22 +786,24 @@ mod tests {
     fn format_highlight_crop_phrase_query() {
         //! testing: https://github.com/meilisearch/meilisearch/issues/3975
         let temp_index = TempIndex::new();
+
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
         temp_index
             .add_documents(documents!([
-                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
             ]))
+                { "id": 1, "text": text }
             .unwrap();
+
         let rtxn = temp_index.read_txn().unwrap();
 
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
-        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
 
         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
         let mut matcher = builder.build(text, None);
         // should return 10 words with a marker at the start as well as the end, and the highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
-            @"…had the power to split <em>the world</em> between those who…"
+            @"…the power to split <em>the world</em> between those who embraced…"
        );
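// Aside (not part of the patch): a std-only sketch of the `char_indices().nth()`
// trick used above — converting a char count into a byte offset so the
// highlight suffix always lands on a character boundary, even for
// multi-byte text.
fn char_count_to_byte_index() {
    let text = "héllo world";
    let char_count = 5; // highlight the first 5 characters: "héllo"
    let byte_index = text
        .char_indices()
        .nth(char_count)
        .map_or(text.len(), |(i, _)| i);
    assert_eq!(&text[..byte_index], "héllo");
    assert_eq!(byte_index, 6); // "é" occupies two bytes
}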
 
         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
         let mut matcher = builder.build(text, None);
         // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+            @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
         );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            // TODO: Should include exclamation mark without crop markers
+            @"…between those who <em>embraced progress and those who resisted change</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"split the world between\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…invention <em>had the power to split the world between those</em>…"
        );
    }
 
@@ -900,7 +923,7 @@
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
+            @"_the_ _do or_ die can't be he do and or isn'_t he_"
         );
     }
 }
diff --git a/crates/milli/src/search/new/matches/simple_token_kind.rs b/crates/milli/src/search/new/matches/simple_token_kind.rs
new file mode 100644
index 000000000..b34a8c985
--- /dev/null
+++ b/crates/milli/src/search/new/matches/simple_token_kind.rs
@@ -0,0 +1,15 @@
+use charabia::{SeparatorKind, Token, TokenKind};
+
+pub enum SimpleTokenKind {
+    Separator(SeparatorKind),
+    NotSeparator,
+}
+
+impl SimpleTokenKind {
+    pub fn new(token: &&Token<'_>) -> Self {
+        match token.kind {
+            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
+            _ => Self::NotSeparator,
+        }
+    }
+}
diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs
index de1dacbe7..90377c09c 100644
--- a/crates/milli/src/search/new/vector_sort.rs
+++ b/crates/milli/src/search/new/vector_sort.rs
@@ -1,11 +1,10 @@
 use std::iter::FromIterator;
 
-use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
 use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
 use crate::score_details::{self, ScoreDetails};
-use crate::vector::{DistributionShift, Embedder};
+use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
 use crate::{DocumentId, Result, SearchContext, SearchLogger};
 
 pub struct VectorSort<Q: RankingRuleQueryTrait> {
@@ -53,14 +52,9 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
         vector_candidates: &RoaringBitmap,
     ) -> Result<()> {
         let target = &self.target;
 
-        let mut results = Vec::new();
-        for reader in ctx.index.arroy_readers(ctx.txn, self.embedder_index, self.quantized) {
-            let nns_by_vector =
-                reader?.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
-            results.extend(nns_by_vector.into_iter());
-        }
-        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+        let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
+        let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
         self.cached_sorted_docids = results.into_iter();
 
         Ok(())
diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs
index 0cb8d723d..5547d800e 100644
--- a/crates/milli/src/search/similar.rs
+++ b/crates/milli/src/search/similar.rs
@@ -1,10 +1,9 @@
 use std::sync::Arc;
 
-use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
 use crate::score_details::{self, ScoreDetails};
-use crate::vector::Embedder;
+use crate::vector::{ArroyWrapper, Embedder};
 use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
 
 pub struct Similar<'a> {
@@ -71,23 +70,13 @@ impl<'a> Similar<'a> {
             .get(self.rtxn, &self.embedder_name)?
             .ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?;
 
-        let mut results = Vec::new();
-
-        for reader in self.index.arroy_readers(self.rtxn, embedder_index, self.quantized) {
-            let nns_by_item = reader?.nns_by_item(
-                self.rtxn,
-                self.id,
-                self.limit + self.offset + 1,
-                Some(&universe),
-            )?;
-            if let Some(mut nns_by_item) = nns_by_item {
-                results.append(&mut nns_by_item);
-            } else {
-                break;
-            }
-        }
-
-        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+        let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
+        let results = reader.nns_by_item(
+            self.rtxn,
+            self.id,
+            self.limit + self.offset + 1,
+            Some(&universe),
+        )?;
 
         let mut documents_ids = Vec::with_capacity(self.limit);
         let mut document_scores = Vec::with_capacity(self.limit);
diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs
index 326dd842d..88d20fff0 100644
--- a/crates/milli/src/update/index_documents/mod.rs
+++ b/crates/milli/src/update/index_documents/mod.rs
@@ -689,9 +689,8 @@
                         key: None,
                     },
                 )?;
-                let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap();
                 let reader =
-                    ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized);
+                    ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
                 let dim = reader.dimensions(self.wtxn)?;
                 dimension.insert(name.to_string(), dim);
             }
@@ -700,6 +699,7 @@
         for (embedder_name, dimension) in dimension {
             let wtxn = &mut *self.wtxn;
             let vector_arroy = self.index.vector_arroy;
+            let cancel = &self.should_abort;
 
             let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
                 InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
@@ -713,17 +713,8 @@
             let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
 
             pool.install(|| {
-                for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
-                    let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized);
-                    if is_quantizing {
-                        writer.quantize(wtxn, k, dimension)?;
-                    }
-                    if writer.need_build(wtxn, dimension)? {
-                        writer.build(wtxn, &mut rng, dimension)?;
-                    } else if writer.is_empty(wtxn, dimension)? {
-                        break;
-                    }
-                }
+                let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
+                writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?;
                 Result::Ok(())
             })
             .map_err(InternalError::from)??;
diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs
index bb2cfe56c..763f30d0f 100644
--- a/crates/milli/src/update/index_documents/transform.rs
+++ b/crates/milli/src/update/index_documents/transform.rs
@@ -990,27 +990,24 @@ impl<'a, 'i> Transform<'a, 'i> {
             None
         };
 
-        let readers: Result<BTreeMap<&str, (Vec<ArroyWrapper>, &RoaringBitmap)>> = settings_diff
+        let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff
             .embedding_config_updates
             .iter()
             .filter_map(|(name, action)| {
                 if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
                     action.write_back()
                 {
-                    let readers: Result<Vec<_>> = self
-                        .index
-                        .arroy_readers(wtxn, *embedder_id, action.was_quantized)
-                        .collect();
-                    match readers {
-                        Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
-                        Err(error) => Some(Err(error)),
-                    }
+                    let reader = ArroyWrapper::new(
+                        self.index.vector_arroy,
+                        *embedder_id,
+                        action.was_quantized,
+                    );
+                    Some((name.as_str(), (reader, user_provided)))
                 } else {
                     None
                 }
             })
             .collect();
-        let readers = readers?;
 
         let old_vectors_fid = settings_diff
             .old
@@ -1048,34 +1045,24 @@
                 arroy::Error,
             > = readers
                 .iter()
-                .filter_map(|(name, (readers, user_provided))| {
+                .filter_map(|(name, (reader, user_provided))| {
                     if !user_provided.contains(docid) {
                         return None;
                     }
-                    let mut vectors = Vec::new();
-                    for reader in readers {
-                        let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
-                            break;
-                        };
-
-                        match vector {
-                            Ok(vector) => vectors.push(vector),
-                            Err(error) => return Some(Err(error)),
-                        }
+                    match reader.item_vectors(wtxn, docid) {
+                        Ok(vectors) if vectors.is_empty() => None,
+                        Ok(vectors) => Some(Ok((
+                            name.to_string(),
+                            serde_json::to_value(ExplicitVectors {
+                                embeddings: Some(
+                                    VectorOrArrayOfVectors::from_array_of_vectors(vectors),
+                                ),
+                                regenerate: false,
+                            })
+                            .unwrap(),
+                        ))),
+                        Err(e) => Some(Err(e)),
                     }
-                    if vectors.is_empty() {
-                        return None;
-                    }
-                    Some(Ok((
-                        name.to_string(),
-                        serde_json::to_value(ExplicitVectors {
-                            embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
-                                vectors,
-                            )),
-                            regenerate: false,
-                        })
-                        .unwrap(),
-                    )))
                 })
                 .collect();
 
@@ -1104,11 +1091,9 @@
         }
 
         // delete all vectors from the embedders that need removal
-        for (_, (readers, _)) in readers {
-            for reader in readers {
-                let dimensions = reader.dimensions(wtxn)?;
-                reader.clear(wtxn, dimensions)?;
-            }
+        for (_, (reader, _)) in readers {
+            let dimensions = reader.dimensions(wtxn)?;
+            reader.clear(wtxn, dimensions)?;
         }
 
         let grenad_params = GrenadParameters {
diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs
index 97a4bf712..20e70b2a6 100644
--- a/crates/milli/src/update/index_documents/typed_chunk.rs
+++ b/crates/milli/src/update/index_documents/typed_chunk.rs
@@ -673,22 +673,14 @@ pub(crate) fn write_typed_chunk_into_index(
                 .get(&embedder_name)
                 .map_or(false, |conf| conf.2);
             // FIXME: allow customizing distance
-            let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
-                .map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized))
-                .collect();
+            let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
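// Aside (not part of the patch): a sketch of the store layout that lets one
// `ArroyWrapper` stand in for the old `Vec` of writers — each embedder owns a
// contiguous range of arroy store ids derived from its id. The body below is
// an assumption about how `arroy_db_range_for_embedder` packs the u8 embedder
// id into the high byte of a u16 store id; only the "one range per embedder"
// idea is taken from the patch itself.
fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
    let base = (embedder_id as u16) << 8;
    // up to 256 stores per embedder, one per extra embedding of a document.
    (0..=u8::MAX).map(move |k| base | (k as u16))
}

fn embedder_range_example() {
    let mut range = arroy_db_range_for_embedder(1);
    assert_eq!(range.next(), Some(0x0100));
    assert_eq!(range.last(), Some(0x01FF));
}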
 
             // remove vectors for docids we want them removed
             let merger = remove_vectors_builder.build();
             let mut iter = merger.into_stream_merger_iter()?;
             while let Some((key, _)) = iter.next()? {
                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
-
-                for writer in &writers {
-                    // Uses invariant: vectors are packed in the first writers.
-                    if !writer.del_item(wtxn, expected_dimension, docid)? {
-                        break;
-                    }
-                }
+                writer.del_items(wtxn, expected_dimension, docid)?;
             }
 
             // add generated embeddings
@@ -716,9 +708,7 @@
                         embeddings.embedding_count(),
                     )));
                 }
-                for (embedding, writer) in embeddings.iter().zip(&writers) {
-                    writer.add_item(wtxn, expected_dimension, docid, embedding)?;
-                }
+                writer.add_items(wtxn, docid, &embeddings)?;
             }
 
             // perform the manual diff
@@ -733,51 +723,14 @@
                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
                     let vector: Vec<f32> = pod_collect_to_vec(value);
 
-                    let mut deleted_index = None;
-                    for (index, writer) in writers.iter().enumerate() {
-                        let Some(candidate) = writer.item_vector(wtxn, docid)? else {
-                            // uses invariant: vectors are packed in the first writers.
-                            break;
-                        };
-                        if candidate == vector {
-                            writer.del_item(wtxn, expected_dimension, docid)?;
-                            deleted_index = Some(index);
-                        }
-                    }
-
-                    // 🥲 enforce invariant: vectors are packed in the first writers.
-                    if let Some(deleted_index) = deleted_index {
-                        let mut last_index_with_a_vector = None;
-                        for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
-                            let Some(candidate) = writer.item_vector(wtxn, docid)? else {
-                                break;
-                            };
-                            last_index_with_a_vector = Some((index, candidate));
-                        }
-                        if let Some((last_index, vector)) = last_index_with_a_vector {
-                            // unwrap: computed the index from the list of writers
-                            let writer = writers.get(last_index).unwrap();
-                            writer.del_item(wtxn, expected_dimension, docid)?;
-                            writers.get(deleted_index).unwrap().add_item(
-                                wtxn,
-                                expected_dimension,
-                                docid,
-                                &vector,
-                            )?;
-                        }
-                    }
+                    writer.del_item(wtxn, docid, &vector)?;
                 }
 
                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
                     let vector = pod_collect_to_vec(value);
 
                     // overflow was detected during vector extraction.
-                    for writer in &writers {
-                        if !writer.contains_item(wtxn, expected_dimension, docid)? {
-                            writer.add_item(wtxn, expected_dimension, docid, &vector)?;
-                            break;
-                        }
-                    }
+                    writer.add_item(wtxn, docid, &vector)?;
                 }
             }
 
diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index d52e68bbe..571c02c8c 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use arroy::distances::{Angular, BinaryQuantizedAngular};
+use arroy::distances::{BinaryQuantizedCosine, Cosine};
 use arroy::ItemId;
 use deserr::{DeserializeError, Deserr};
 use heed::{RoTxn, RwTxn, Unspecified};
@@ -32,105 +32,243 @@ pub const REQUEST_PARALLELISM: usize = 40;
 
 pub struct ArroyWrapper {
     quantized: bool,
-    index: u16,
+    embedder_index: u8,
     database: arroy::Database<Unspecified>,
 }
 
 impl ArroyWrapper {
-    pub fn new(database: arroy::Database<Unspecified>, index: u16, quantized: bool) -> Self {
-        Self { database, index, quantized }
+    pub fn new(
+        database: arroy::Database<Unspecified>,
+        embedder_index: u8,
+        quantized: bool,
+    ) -> Self {
+        Self { database, embedder_index, quantized }
     }
 
-    pub fn index(&self) -> u16 {
-        self.index
+    pub fn embedder_index(&self) -> u8 {
+        self.embedder_index
+    }
+
+    fn readers<'a, D: arroy::Distance>(
+        &'a self,
+        rtxn: &'a RoTxn<'a>,
+        db: arroy::Database<D>,
+    ) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
+        arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| {
+            match arroy::Reader::open(rtxn, index, db) {
+                Ok(reader) => match reader.is_empty(rtxn) {
+                    Ok(false) => Some(Ok(reader)),
+                    Ok(true) => None,
+                    Err(e) => Some(Err(e)),
+                },
+                Err(arroy::Error::MissingMetadata(_)) => None,
+                Err(e) => Some(Err(e)),
+            }
+        })
     }
 
     pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
+        let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap();
         if self.quantized {
-            Ok(arroy::Reader::open(rtxn, self.index, self.quantized_db())?.dimensions())
+            Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions())
         } else {
-            Ok(arroy::Reader::open(rtxn, self.index, self.angular_db())?.dimensions())
+            Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions())
         }
     }
 
-    pub fn quantize(
+    pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
         &mut self,
         wtxn: &mut RwTxn,
-        index: u16,
+        rng: &mut R,
         dimension: usize,
+        quantizing: bool,
+        cancel: &(impl Fn() -> bool + Sync + Send),
     ) -> Result<(), arroy::Error> {
-        if !self.quantized {
-            let writer = arroy::Writer::new(self.angular_db(), index, dimension);
-            writer.prepare_changing_distance::<BinaryQuantizedAngular>(wtxn)?;
-            self.quantized = true;
+        for index in arroy_db_range_for_embedder(self.embedder_index) {
+            if self.quantized {
+                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                if writer.need_build(wtxn)? {
+                    writer.builder(rng).build(wtxn)?
+                } else if writer.is_empty(wtxn)? {
+                    break;
+                }
+            } else {
+                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                // If we are quantizing the databases, we can't know from meilisearch
+                // if the db was empty but still contained the wrong metadata, thus we need
+                // to quantize everything and can't stop early. Since this operation can
+                // only happen once in the life of an embedder, it's not very performance
+                // sensitive.
+                if quantizing && !self.quantized {
+                    let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
+                    writer.builder(rng).cancel(cancel).build(wtxn)?;
+                } else if writer.need_build(wtxn)? {
+                    writer.builder(rng).cancel(cancel).build(wtxn)?;
+                } else if writer.is_empty(wtxn)? {
+                    break;
+                }
+            }
         }
         Ok(())
     }
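// Aside (not part of the patch): a std-only sketch of the "vectors are packed
// in the first writers" invariant that `_add_item`/`_del_item` below maintain.
// Each item's embeddings occupy a prefix of the per-embedder stores, so any
// lookup can stop at the first empty store; the `Option<Vec<f32>>` slots are
// a simplified stand-in for the real arroy stores.
fn first_empty_slot_insert(stores: &mut [Option<Vec<f32>>], vector: Vec<f32>) -> bool {
    match stores.iter_mut().find(|slot| slot.is_none()) {
        Some(slot) => {
            *slot = Some(vector);
            true
        }
        // all stores are full: the extra vector is silently ignored,
        // mirroring the u8::MAX cap documented on `add_items`.
        None => false,
    }
}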
-    pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
-        if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension).need_build(rtxn)
-        } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension).need_build(rtxn)
-        }
-    }
-
-    pub fn build<R: rand::Rng + rand::SeedableRng>(
+    /// Overwrite all the embeddings associated with the index and item ID.
+    /// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings.
+    ///     You should call `del_items` on the `item_id` before calling this method.
+    /// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored.
+    pub fn add_items(
         &self,
         wtxn: &mut RwTxn,
-        rng: &mut R,
-        dimension: usize,
+        item_id: arroy::ItemId,
+        embeddings: &Embeddings<f32>,
     ) -> Result<(), arroy::Error> {
-        if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension).build(wtxn, rng, None)
-        } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension).build(wtxn, rng, None)
+        let dimension = embeddings.dimension();
+        for (index, vector) in
+            arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter())
+        {
+            if self.quantized {
+                arroy::Writer::new(self.quantized_db(), index, dimension)
+                    .add_item(wtxn, item_id, vector)?
+            } else {
+                arroy::Writer::new(self.angular_db(), index, dimension)
+                    .add_item(wtxn, item_id, vector)?
+            }
         }
+        Ok(())
     }

+    /// Add one vector for this item ID into the first arroy index with an empty spot.
     pub fn add_item(
         &self,
         wtxn: &mut RwTxn,
-        dimension: usize,
         item_id: arroy::ItemId,
         vector: &[f32],
     ) -> Result<(), arroy::Error> {
         if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension)
-                .add_item(wtxn, item_id, vector)
+            self._add_item(wtxn, self.quantized_db(), item_id, vector)
         } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension)
-                .add_item(wtxn, item_id, vector)
+            self._add_item(wtxn, self.angular_db(), item_id, vector)
         }
     }

-    pub fn del_item(
+    fn _add_item<D: arroy::Distance>(
+        &self,
+        wtxn: &mut RwTxn,
+        db: arroy::Database<D>,
+        item_id: arroy::ItemId,
+        vector: &[f32],
+    ) -> Result<(), arroy::Error> {
+        let dimension = vector.len();
+
+        for index in arroy_db_range_for_embedder(self.embedder_index) {
+            let writer = arroy::Writer::new(db, index, dimension);
+            if !writer.contains_item(wtxn, item_id)? {
+                writer.add_item(wtxn, item_id, vector)?;
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    /// Delete all embeddings from a specific `item_id`.
+    pub fn del_items(
         &self,
         wtxn: &mut RwTxn,
         dimension: usize,
         item_id: arroy::ItemId,
+    ) -> Result<(), arroy::Error> {
+        for index in arroy_db_range_for_embedder(self.embedder_index) {
+            if self.quantized {
+                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                if !writer.del_item(wtxn, item_id)? {
+                    break;
+                }
+            } else {
+                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                if !writer.del_item(wtxn, item_id)? {
+                    break;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
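
The `add_items`/`add_item` split matters for documents carrying several embeddings for one embedder, for example one per chunk. A usage sketch with made-up vectors and dimension, assuming `Embeddings::new` and `Embeddings::push` behave as exposed by milli's vector module:

    // Store two embeddings for document 42, then append a third one later.
    let mut embeddings = Embeddings::new(3); // dimension 3, made up for the example
    embeddings.push(vec![0.0, 1.0, 0.0]).unwrap();
    embeddings.push(vec![1.0, 0.0, 0.0]).unwrap();
    writer.del_items(wtxn, 3, 42)?; // clear stale entries first, as the doc comment advises
    writer.add_items(wtxn, 42, &embeddings)?; // fills the first two slots of the block
    writer.add_item(wtxn, 42, &[0.5, 0.5, 0.0])?; // lands in the first free slot after them
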
+    /// Delete one embedding of `item_id`, identified by its value.
+    pub fn del_item(
+        &self,
+        wtxn: &mut RwTxn,
+        item_id: arroy::ItemId,
+        vector: &[f32],
     ) -> Result<bool, arroy::Error> {
         if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension).del_item(wtxn, item_id)
+            self._del_item(wtxn, self.quantized_db(), item_id, vector)
         } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension).del_item(wtxn, item_id)
+            self._del_item(wtxn, self.angular_db(), item_id, vector)
         }
     }

+    fn _del_item<D: arroy::Distance>(
+        &self,
+        wtxn: &mut RwTxn,
+        db: arroy::Database<D>,
+        item_id: arroy::ItemId,
+        vector: &[f32],
+    ) -> Result<bool, arroy::Error> {
+        let dimension = vector.len();
+        let mut deleted_index = None;
+
+        for index in arroy_db_range_for_embedder(self.embedder_index) {
+            let writer = arroy::Writer::new(db, index, dimension);
+            let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
+                // uses invariant: vectors are packed in the first writers.
+                break;
+            };
+            if candidate == vector {
+                writer.del_item(wtxn, item_id)?;
+                deleted_index = Some(index);
+            }
+        }
+
+        // 🥲 enforce invariant: vectors are packed in the first writers.
+        if let Some(deleted_index) = deleted_index {
+            let mut last_index_with_a_vector = None;
+            for index in
+                arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize)
+            {
+                let writer = arroy::Writer::new(db, index, dimension);
+                let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
+                    break;
+                };
+                last_index_with_a_vector = Some((index, candidate));
+            }
+            if let Some((last_index, vector)) = last_index_with_a_vector {
+                let writer = arroy::Writer::new(db, last_index, dimension);
+                writer.del_item(wtxn, item_id)?;
+                let writer = arroy::Writer::new(db, deleted_index, dimension);
+                writer.add_item(wtxn, item_id, &vector)?;
+            }
+        }
+        Ok(deleted_index.is_some())
+    }
+
     pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
-        if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension).clear(wtxn)
-        } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension).clear(wtxn)
-        }
-    }
-
-    pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
-        if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension).is_empty(rtxn)
-        } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension).is_empty(rtxn)
+        for index in arroy_db_range_for_embedder(self.embedder_index) {
+            if self.quantized {
+                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                if writer.is_empty(wtxn)? {
+                    break;
+                }
+                writer.clear(wtxn)?;
+            } else {
+                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                if writer.is_empty(wtxn)? {
+                    break;
+                }
+                writer.clear(wtxn)?;
+            }
         }
+        Ok(())
     }

     pub fn contains_item(
@@ -139,11 +277,25 @@ impl ArroyWrapper {
         dimension: usize,
         item: arroy::ItemId,
     ) -> Result<bool, arroy::Error> {
-        if self.quantized {
-            arroy::Writer::new(self.quantized_db(), self.index, dimension).contains_item(rtxn, item)
-        } else {
-            arroy::Writer::new(self.angular_db(), self.index, dimension).contains_item(rtxn, item)
+        for index in arroy_db_range_for_embedder(self.embedder_index) {
+            let contains = if self.quantized {
+                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                if writer.is_empty(rtxn)? {
+                    break;
+                }
+                writer.contains_item(rtxn, item)?
+            } else {
+                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                if writer.is_empty(rtxn)? {
+                    break;
+                }
+                writer.contains_item(rtxn, item)?
+            };
+            if contains {
+                return Ok(contains);
+            }
         }
+        Ok(false)
     }
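
Because arroy keys items only by (index, item_id), removing a single embedding has to identify it by value; that is what the new `vector` parameter of `del_item` is for, and `_del_item` then re-packs the remaining slots so lookups can keep stopping at the first gap. A caller-side sketch, with made-up values:

    // Remove exactly one embedding of document 42, identified by its value.
    // Returns false when no stored embedding compared equal to `stale`.
    let stale = vec![0.12_f32, -0.34, 0.56]; // made-up embedding
    let removed: bool = writer.del_item(wtxn, 42, &stale)?;
    if !removed {
        // nothing matched: the embedding was never stored or was already deleted
    }
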

     pub fn nns_by_item(
@@ -152,45 +304,108 @@ impl ArroyWrapper {
         item: ItemId,
         limit: usize,
         filter: Option<&RoaringBitmap>,
-    ) -> Result<Option<Vec<(ItemId, f32)>>, arroy::Error> {
+    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
         if self.quantized {
-            arroy::Reader::open(rtxn, self.index, self.quantized_db())?
-                .nns_by_item(rtxn, item, limit, None, None, filter)
+            self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter)
         } else {
-            arroy::Reader::open(rtxn, self.index, self.angular_db())?
-                .nns_by_item(rtxn, item, limit, None, None, filter)
+            self._nns_by_item(rtxn, self.angular_db(), item, limit, filter)
         }
     }

+    fn _nns_by_item<D: arroy::Distance>(
+        &self,
+        rtxn: &RoTxn,
+        db: arroy::Database<D>,
+        item: ItemId,
+        limit: usize,
+        filter: Option<&RoaringBitmap>,
+    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
+        let mut results = Vec::new();
+
+        for reader in self.readers(rtxn, db) {
+            let reader = reader?;
+            let mut searcher = reader.nns(limit);
+            if let Some(filter) = filter {
+                searcher.candidates(filter);
+            }
+
+            if let Some(mut ret) = searcher.by_item(rtxn, item)? {
+                results.append(&mut ret);
+            } else {
+                break;
+            }
+        }
+        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+        Ok(results)
+    }
+
     pub fn nns_by_vector(
         &self,
-        txn: &RoTxn,
-        item: &[f32],
+        rtxn: &RoTxn,
+        vector: &[f32],
         limit: usize,
         filter: Option<&RoaringBitmap>,
     ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
         if self.quantized {
-            arroy::Reader::open(txn, self.index, self.quantized_db())?
-                .nns_by_vector(txn, item, limit, None, None, filter)
+            self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter)
         } else {
-            arroy::Reader::open(txn, self.index, self.angular_db())?
-                .nns_by_vector(txn, item, limit, None, None, filter)
+            self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter)
         }
     }

-    pub fn item_vector(&self, rtxn: &RoTxn, docid: u32) -> Result<Option<Vec<f32>>, arroy::Error> {
+    fn _nns_by_vector<D: arroy::Distance>(
+        &self,
+        rtxn: &RoTxn,
+        db: arroy::Database<D>,
+        vector: &[f32],
+        limit: usize,
+        filter: Option<&RoaringBitmap>,
+    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
+        let mut results = Vec::new();
+
+        for reader in self.readers(rtxn, db) {
+            let reader = reader?;
+            let mut searcher = reader.nns(limit);
+            if let Some(filter) = filter {
+                searcher.candidates(filter);
+            }
+
+            results.append(&mut searcher.by_vector(rtxn, vector)?);
+        }
+
+        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+
+        Ok(results)
+    }
+
+    pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, arroy::Error> {
+        let mut vectors = Vec::new();
+
         if self.quantized {
-            arroy::Reader::open(rtxn, self.index, self.quantized_db())?.item_vector(rtxn, docid)
+            for reader in self.readers(rtxn, self.quantized_db()) {
+                if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
+                    vectors.push(vec);
+                } else {
+                    break;
+                }
+            }
         } else {
-            arroy::Reader::open(rtxn, self.index, self.angular_db())?.item_vector(rtxn, docid)
+            for reader in self.readers(rtxn, self.angular_db()) {
+                if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
+                    vectors.push(vec);
+                } else {
+                    break;
+                }
+            }
         }
+        Ok(vectors)
     }

-    fn angular_db(&self) -> arroy::Database<Angular> {
+    fn angular_db(&self) -> arroy::Database<Cosine> {
         self.database.remap_data_type()
     }

-    fn quantized_db(&self) -> arroy::Database<BinaryQuantizedAngular> {
+    fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
         self.database.remap_data_type()
     }
 }
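
On the read side, `_nns_by_item` and `_nns_by_vector` query every non-empty arroy index of the embedder and merge the hits into one list sorted by ascending distance, so a document with several embeddings can legitimately appear more than once. A query-side sketch; the query vector, `dimension`, and limit are made up:

    // Search one embedder across all of its arroy indexes.
    let query: Vec<f32> = vec![0.1; dimension]; // made-up query embedding
    let hits: Vec<(u32, f32)> = writer.nns_by_vector(rtxn, &query, 20, None)?;
    for (docid, distance) in hits {
        // the same docid may show up once per stored embedding that matched
        println!("doc {docid} at distance {distance}");
    }
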
diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs
index 3bb7f09e6..d1cf364a2 100644
--- a/crates/milli/src/vector/settings.rs
+++ b/crates/milli/src/vector/settings.rs
@@ -417,6 +417,8 @@ impl EmbeddingSettings {

     pub const DISTRIBUTION: &'static str = "distribution";

+    pub const BINARY_QUANTIZED: &'static str = "binaryQuantized";
+
     pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] {
         match field {
             Self::SOURCE => &[
@@ -456,6 +458,13 @@ impl EmbeddingSettings {
                 EmbedderSource::Rest,
                 EmbedderSource::UserProvided,
             ],
+            Self::BINARY_QUANTIZED => &[
+                EmbedderSource::HuggingFace,
+                EmbedderSource::Ollama,
+                EmbedderSource::OpenAi,
+                EmbedderSource::Rest,
+                EmbedderSource::UserProvided,
+            ],
             _other => unreachable!("unknown field"),
         }
     }
@@ -470,6 +479,7 @@ impl EmbeddingSettings {
                 Self::DIMENSIONS,
                 Self::DISTRIBUTION,
                 Self::URL,
+                Self::BINARY_QUANTIZED,
             ],
             EmbedderSource::HuggingFace => &[
                 Self::SOURCE,
@@ -477,6 +487,7 @@ impl EmbeddingSettings {
                 Self::REVISION,
                 Self::DOCUMENT_TEMPLATE,
                 Self::DISTRIBUTION,
+                Self::BINARY_QUANTIZED,
             ],
             EmbedderSource::Ollama => &[
                 Self::SOURCE,
@@ -486,8 +497,11 @@ impl EmbeddingSettings {
                 Self::API_KEY,
                 Self::DIMENSIONS,
                 Self::DISTRIBUTION,
+                Self::BINARY_QUANTIZED,
             ],
-            EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION],
+            EmbedderSource::UserProvided => {
+                &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION, Self::BINARY_QUANTIZED]
+            }
             EmbedderSource::Rest => &[
                 Self::SOURCE,
                 Self::API_KEY,
@@ -498,6 +512,7 @@ impl EmbeddingSettings {
                 Self::RESPONSE,
                 Self::HEADERS,
                 Self::DISTRIBUTION,
+                Self::BINARY_QUANTIZED,
             ],
         }
     }
diff --git a/workloads/search/embeddings-movies-subset-hf.json b/workloads/search/embeddings-movies-subset-hf.json
index aeeecac59..36f45cfb9 100644
--- a/workloads/search/embeddings-movies-subset-hf.json
+++ b/workloads/search/embeddings-movies-subset-hf.json
@@ -77,7 +77,8 @@
         "q": "puppy cute comforting movie",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 0.1
+          "semanticRatio": 0.1,
+          "embedder": "default"
         }
       }
     },
@@ -91,7 +92,8 @@
         "q": "puppy cute comforting movie",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 0.5
+          "semanticRatio": 0.5,
+          "embedder": "default"
         }
       }
     },
@@ -105,7 +107,8 @@
         "q": "puppy cute comforting movie",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 0.9
+          "semanticRatio": 0.9,
+          "embedder": "default"
         }
       }
     },
@@ -119,7 +122,8 @@
         "q": "puppy cute comforting movie",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 1.0
+          "semanticRatio": 1.0,
+          "embedder": "default"
         }
       }
     },
@@ -133,7 +137,8 @@
         "q": "shrek",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 1.0
+          "semanticRatio": 1.0,
+          "embedder": "default"
         }
       }
     },
@@ -147,7 +152,8 @@
         "q": "shrek",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 0.5
+          "semanticRatio": 0.5,
+          "embedder": "default"
         }
       }
     },
@@ -161,7 +167,8 @@
         "q": "shrek",
         "limit": 100,
         "hybrid": {
-          "semanticRatio": 0.1
+          "semanticRatio": 0.1,
+          "embedder": "default"
        }
      }
    },
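
Taken together, the settings hunks expose the new option as `binaryQuantized` on every embedder source, and the workload hunks reflect that hybrid queries now name their embedder explicitly. A sketch of both payloads built with serde_json; the `default` embedder name comes from the workload file, and the dimension is made up:

    use serde_json::json;

    fn main() {
        // Embedder settings opting in to binary quantization. Per the
        // build_and_quantize logic above, this switch is effectively one-way.
        let settings = json!({
            "embedders": {
                "default": {
                    "source": "userProvided",
                    "dimensions": 384, // made up
                    "binaryQuantized": true
                }
            }
        });

        // Hybrid search request: `embedder` now accompanies `semanticRatio`.
        let query = json!({
            "q": "puppy cute comforting movie",
            "limit": 100,
            "hybrid": { "semanticRatio": 0.5, "embedder": "default" }
        });

        println!("{settings}\n{query}");
    }
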