Compare commits


No commits in common. "78828197957c90d82d1980608210fa722519b387" and "46fc6bbc2eecfcf5544a799d4c5ff7705ac10e95" have entirely different histories.

69 changed files with 3381 additions and 4263 deletions

View File

@@ -43,7 +43,7 @@ jobs:
 # Run benchmarks
 - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
 run: |
-cd crates/benchmarks
+cd benchmarks
 cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
 # Generate critcmp files

View File

@@ -88,7 +88,7 @@ jobs:
 # Run benchmarks
 - name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
 run: |
-cd crates/benchmarks
+cd benchmarks
 cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
 # Generate critcmp files

View File

@@ -41,7 +41,7 @@ jobs:
 # Run benchmarks
 - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
 run: |
-cd crates/benchmarks
+cd benchmarks
 cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
 # Generate critcmp files

View File

@@ -40,7 +40,7 @@ jobs:
 # Run benchmarks
 - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
 run: |
-cd crates/benchmarks
+cd benchmarks
 cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
 # Generate critcmp files

View File

@@ -40,7 +40,7 @@ jobs:
 # Run benchmarks
 - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
 run: |
-cd crates/benchmarks
+cd benchmarks
 cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
 # Generate critcmp files

View File

@@ -65,9 +65,9 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-os: [macos-13, windows-2022]
+os: [macos-12, windows-2022]
 include:
-- os: macos-13
+- os: macos-12
 artifact_name: meilisearch
 asset_name: meilisearch-macos-amd64
 - os: windows-2022
@@ -90,7 +90,7 @@ jobs:
 publish-macos-apple-silicon:
 name: Publish binary for macOS silicon
-runs-on: macos-13
+runs-on: macos-12
 needs: check-version
 strategy:
 matrix:

View File

@@ -51,7 +51,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-os: [macos-13, windows-2022]
+os: [macos-12, windows-2022]
 steps:
 - uses: actions/checkout@v3
 - name: Cache dependencies

Cargo.lock (generated)
View File

@@ -386,28 +386,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
 [[package]]
 name = "arroy"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e"
-dependencies = [
-"bytemuck",
-"byteorder",
-"heed",
-"log",
-"memmap2",
-"nohash",
-"ordered-float",
-"rand",
-"rayon",
-"roaring",
-"tempfile",
-"thiserror",
-]
-
-[[package]]
-name = "arroy"
-version = "0.5.0"
-source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7"
+version = "0.4.0"
+source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d"
 dependencies = [
 "bytemuck",
 "byteorder",
@@ -726,9 +706,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 [[package]]
 name = "bytemuck"
-version = "1.19.0"
+version = "1.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
+checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
 dependencies = [
 "bytemuck_derive",
 ]
@@ -2575,7 +2555,7 @@ name = "index-scheduler"
 version = "1.11.0"
 dependencies = [
 "anyhow",
-"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
+"arroy",
 "big_s",
 "bincode",
 "crossbeam",
@@ -3434,7 +3414,6 @@ dependencies = [
 "meilisearch-types",
 "mimalloc",
 "mime",
-"mopa-maintained",
 "num_cpus",
 "obkv",
 "once_cell",
@@ -3537,7 +3516,6 @@ name = "meilitool"
 version = "1.11.0"
 dependencies = [
 "anyhow",
-"arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)",
 "clap",
 "dump",
 "file-store",
@@ -3568,7 +3546,7 @@ dependencies = [
 name = "milli"
 version = "1.11.0"
 dependencies = [
-"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
+"arroy",
 "big_s",
 "bimap",
 "bincode",
@@ -3703,12 +3681,6 @@ dependencies = [
 "syn 2.0.60",
 ]
 
-[[package]]
-name = "mopa-maintained"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11"
-
 [[package]]
 name = "mutually_exclusive_features"
 version = "0.0.3"
@@ -4610,8 +4582,9 @@ dependencies = [
 [[package]]
 name = "rhai"
-version = "1.20.0"
-source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702"
 dependencies = [
 "ahash 0.8.11",
 "bitflags 2.6.0",
@@ -4628,7 +4601,8 @@ dependencies = [
 [[package]]
 name = "rhai_codegen"
 version = "2.2.0"
-source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b"
 dependencies = [
 "proc-macro2",
 "quote",

View File

@@ -1,6 +1,6 @@
 status = [
 'Tests on ubuntu-20.04',
-'Tests on macos-13',
+'Tests on macos-12',
 'Tests on windows-2022',
 'Run Clippy',
 'Run Rustfmt',

View File

@@ -405,11 +405,11 @@ impl From<v5::Action> for v6::Action {
 v5::Action::SettingsAll => v6::Action::SettingsAll,
 v5::Action::SettingsGet => v6::Action::SettingsGet,
 v5::Action::SettingsUpdate => v6::Action::SettingsUpdate,
-v5::Action::StatsAll => v6::Action::StatsAll,
+v5::Action::StatsAll => v6::Action::StatsGet,
 v5::Action::StatsGet => v6::Action::StatsGet,
-v5::Action::MetricsAll => v6::Action::MetricsAll,
+v5::Action::MetricsAll => v6::Action::MetricsGet,
 v5::Action::MetricsGet => v6::Action::MetricsGet,
-v5::Action::DumpsAll => v6::Action::DumpsAll,
+v5::Action::DumpsAll => v6::Action::DumpsCreate,
 v5::Action::DumpsCreate => v6::Action::DumpsCreate,
 v5::Action::Version => v6::Action::Version,
 v5::Action::KeysAdd => v6::Action::KeysAdd,
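These rows line up with the keys.rs hunks later in the diff: on the side where `Action` has no `*All` variants, each v5 wildcard action has to be folded into the single action it implied. A small, hypothetical sketch of that folding (simplified enums, not the real `v5`/`v6` dump modules):

```rust
// Simplified, hypothetical enums; the real conversion lives in the dump compat layer.
#[derive(Debug, PartialEq)]
enum V5Action {
    StatsAll,
    StatsGet,
    MetricsAll,
    MetricsGet,
    DumpsAll,
    DumpsCreate,
}

#[derive(Debug, PartialEq)]
enum V6Action {
    StatsGet,
    MetricsGet,
    DumpsCreate,
}

// Wildcard actions are folded into the closest single action they granted.
fn convert(action: V5Action) -> V6Action {
    match action {
        V5Action::StatsAll | V5Action::StatsGet => V6Action::StatsGet,
        V5Action::MetricsAll | V5Action::MetricsGet => V6Action::MetricsGet,
        V5Action::DumpsAll | V5Action::DumpsCreate => V6Action::DumpsCreate,
    }
}

fn main() {
    assert_eq!(convert(V5Action::StatsAll), V6Action::StatsGet);
    assert_eq!(convert(V5Action::DumpsAll), V6Action::DumpsCreate);
}
```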

View File

@@ -40,7 +40,7 @@ ureq = "2.10.0"
 uuid = { version = "1.10.0", features = ["serde", "v4"] }
 
 [dev-dependencies]
-arroy = "0.5.0"
+arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
 big_s = "1.0.2"
 crossbeam = "0.8.4"
 insta = { version = "1.39.0", features = ["json", "redactions"] }

View File

@@ -1263,7 +1263,7 @@ impl IndexScheduler {
 #[cfg(test)]
 self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?;
-tracing::error!("Batch failed {}", error);
+tracing::info!("Batch failed {}", error);
 self.update_task(&mut wtxn, &task)
 .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?;

View File

@@ -67,8 +67,5 @@ khmer = ["milli/khmer"]
 vietnamese = ["milli/vietnamese"]
 # force swedish character recomposition
 swedish-recomposition = ["milli/swedish-recomposition"]
-# allow german tokenization
+# force german character recomposition
 german = ["milli/german"]
-# allow turkish normalization
-turkish = ["milli/turkish"]

View File

@@ -183,7 +183,6 @@ fn parse_expiration_date(
 bitflags! {
 #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
 #[repr(transparent)]
-// NOTE: For `Sequence` impl to work, the values of these must be in ascending order
 pub struct Action: u32 {
 const Search = 1;
 // Documents
@@ -207,19 +206,10 @@ bitflags! {
 const SettingsGet = 1 << 12;
 const SettingsUpdate = 1 << 13;
 const SettingsAll = Self::SettingsGet.bits() | Self::SettingsUpdate.bits();
-// Stats
 const StatsGet = 1 << 14;
-const StatsAll = Self::StatsGet.bits();
-// Metrics
 const MetricsGet = 1 << 15;
-const MetricsAll = Self::MetricsGet.bits();
-// Dumps
 const DumpsCreate = 1 << 16;
-const DumpsAll = Self::DumpsCreate.bits();
-// Snapshots
 const SnapshotsCreate = 1 << 17;
-const SnapshotsAll = Self::SnapshotsCreate.bits();
-// Keys without an "all" version
 const Version = 1 << 18;
 const KeysAdd = 1 << 19;
 const KeysGet = 1 << 20;
@@ -227,13 +217,22 @@ bitflags! {
 const KeysDelete = 1 << 22;
 const ExperimentalFeaturesGet = 1 << 23;
 const ExperimentalFeaturesUpdate = 1 << 24;
-// All
-const All = 0xFFFFFFFF >> (32 - 1 - 24);
+const All = {
+let mut all = 0;
+let mut exp = 0;
+while exp <= 24 {
+all = (all << 1) + 1;
+exp += 1;
+}
+all
+};
 }
 }
 
 impl Action {
-const SERDE_MAP_ARR: [(&'static str, Self); 34] = [
+const SERDE_MAP_ARR: [(&'static str, Self); 30] = [
 ("search", Self::Search),
 ("documents.add", Self::DocumentsAdd),
 ("documents.get", Self::DocumentsGet),
@@ -253,13 +252,9 @@ impl Action {
 ("settings.update", Self::SettingsUpdate),
 ("settings.*", Self::SettingsAll),
 ("stats.get", Self::StatsGet),
-("stats.*", Self::StatsAll),
 ("metrics.get", Self::MetricsGet),
-("metrics.*", Self::MetricsAll),
 ("dumps.create", Self::DumpsCreate),
-("dumps.*", Self::DumpsAll),
 ("snapshots.create", Self::SnapshotsCreate),
-("snapshots.*", Self::SnapshotsAll),
 ("version", Self::Version),
 ("keys.create", Self::KeysAdd),
 ("keys.get", Self::KeysGet),
@@ -284,19 +279,6 @@ impl Action {
 .map(|(serde_name, _)| serde_name)
 .expect("an action is missing a matching serialized value")
 }
-
-// when we remove "all" flags, this will give us the exact index
-fn get_potential_index(&self) -> usize {
-if self.is_empty() {
-return 0;
-}
-// most significant bit for u32
-let msb = 1u32 << (31 - self.bits().leading_zeros());
-// index of the single set bit
-msb.trailing_zeros() as usize
-}
 }
 
 pub mod actions {
@@ -321,13 +303,9 @@ pub mod actions {
 pub const SETTINGS_UPDATE: u32 = A::SettingsUpdate.bits();
 pub const SETTINGS_ALL: u32 = A::SettingsAll.bits();
 pub const STATS_GET: u32 = A::StatsGet.bits();
-pub const STATS_ALL: u32 = A::StatsAll.bits();
 pub const METRICS_GET: u32 = A::MetricsGet.bits();
-pub const METRICS_ALL: u32 = A::MetricsAll.bits();
 pub const DUMPS_CREATE: u32 = A::DumpsCreate.bits();
-pub const DUMPS_ALL: u32 = A::DumpsAll.bits();
 pub const SNAPSHOTS_CREATE: u32 = A::SnapshotsCreate.bits();
-pub const SNAPSHOTS_ALL: u32 = A::SnapshotsAll.bits();
 pub const VERSION: u32 = A::Version.bits();
 pub const KEYS_CREATE: u32 = A::KeysAdd.bits();
 pub const KEYS_GET: u32 = A::KeysGet.bits();
@@ -335,7 +313,6 @@ pub mod actions {
 pub const KEYS_DELETE: u32 = A::KeysDelete.bits();
 pub const EXPERIMENTAL_FEATURES_GET: u32 = A::ExperimentalFeaturesGet.bits();
 pub const EXPERIMENTAL_FEATURES_UPDATE: u32 = A::ExperimentalFeaturesUpdate.bits();
-pub const ALL: u32 = A::All.bits();
 }
 
 impl<E: DeserializeError> Deserr<E> for Action {
@@ -404,55 +381,50 @@ impl<'de> Deserialize<'de> for Action {
 }
 }
 
-// TODO: Once "all" type flags are removed, simplify
-// Essentially `get_potential_index` will give the exact index, +1 the exact next, -1 the exact previous
 impl Sequence for Action {
 const CARDINALITY: usize = Self::FLAGS.len();
 
 fn next(&self) -> Option<Self> {
-let mut potential_next_index = self.get_potential_index() + 1;
-loop {
-if let Some(next_flag) = Self::FLAGS.get(potential_next_index).map(|v| v.value()) {
-if next_flag > self {
-return Some(*next_flag);
-}
-potential_next_index += 1;
-} else {
-return None;
-}
-}
+let mut iter = Self::FLAGS.iter();
+while let Some(action) = iter.next() {
+if action.value() == self {
+if let Some(action) = iter.next() {
+return Some(*action.value());
+}
+break;
+}
+}
+None
 }
 
 fn previous(&self) -> Option<Self> {
-// -2 because of "all" type flags that represent a single flag, otherwise -1 would suffice
-let initial_potential_index = self.get_potential_index();
-if initial_potential_index == 0 {
-return None;
-}
-let mut potential_previous_index: usize =
-if initial_potential_index == 1 { 0 } else { initial_potential_index - 2 };
-let mut previous_item: Option<Self> = None;
-let mut pre_previous_item: Option<Self> = None;
-loop {
-if let Some(next_flag) = Self::FLAGS.get(potential_previous_index).map(|v| v.value()) {
-if next_flag > self {
-return pre_previous_item;
-}
-pre_previous_item = previous_item;
-previous_item = Some(*next_flag);
-potential_previous_index += 1;
-} else {
-return pre_previous_item;
-}
-}
+let mut iter = Self::FLAGS.iter().peekable();
+if let Some(action) = iter.next() {
+if action.value() == self {
+return None;
+}
+if let Some(next_action) = iter.peek() {
+if next_action.value() == self {
+return Some(*action.value());
+}
+}
+}
+while let Some(action) = iter.next() {
+if let Some(next_action) = iter.peek() {
+if next_action.value() == self {
+return Some(*action.value());
+}
+}
+}
+None
 }
 
 fn first() -> Option<Self> {
 Self::FLAGS.first().map(|v| *v.value())
 }
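A quick way to check that the two `All` definitions in the bitflags hunk above agree, and to see what the removed `get_potential_index` helper was doing, is a tiny standalone program. This is a sketch with plain `u32` arithmetic, not the real `bitflags`-generated type:

```rust
// Standalone sketch: verify that the shift-based and loop-based `All` masks match,
// and show the most-significant-bit trick used by the removed `get_potential_index`.
fn main() {
    // `const All = 0xFFFFFFFF >> (32 - 1 - 24);` keeps bits 0..=24.
    let all_shift: u32 = 0xFFFF_FFFF >> (32 - 1 - 24);

    // The const-block form builds the same mask one bit at a time.
    let mut all_loop: u32 = 0;
    let mut exp = 0;
    while exp <= 24 {
        all_loop = (all_loop << 1) + 1;
        exp += 1;
    }

    assert_eq!(all_shift, all_loop);
    assert_eq!(all_shift, (1u32 << 25) - 1); // 0x01FF_FFFF

    // For a single-bit flag such as `SettingsUpdate = 1 << 13`, the msb of the value
    // is the value itself, and its trailing zero count is the flag's position.
    let settings_update = 1u32 << 13;
    let msb = 1u32 << (31 - settings_update.leading_zeros());
    assert_eq!(msb.trailing_zeros(), 13);
}
```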

View File

@@ -75,7 +75,7 @@ reqwest = { version = "0.12.5", features = [
 rustls = { version = "0.23.11", features = ["ring"], default-features = false }
 rustls-pki-types = { version = "1.7.0", features = ["alloc"] }
 rustls-pemfile = "2.1.2"
-segment = { version = "0.2.4" }
+segment = { version = "0.2.4", optional = true }
 serde = { version = "1.0.204", features = ["derive"] }
 serde_json = { version = "1.0.120", features = ["preserve_order"] }
 sha2 = "0.10.8"
@@ -104,7 +104,6 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
 tracing-actix-web = "0.7.11"
 build-info = { version = "1.7.0", path = "../build-info" }
 roaring = "0.10.2"
-mopa-maintained = "0.2.3"
 
 [dev-dependencies]
 actix-rt = "2.10.0"
@@ -132,7 +131,8 @@ tempfile = { version = "3.10.1", optional = true }
 zip = { version = "2.1.3", optional = true }
 
 [features]
-default = ["meilisearch-types/all-tokenizations", "mini-dashboard"]
+default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
+analytics = ["segment"]
 mini-dashboard = [
 "static-files",
 "anyhow",
@@ -154,8 +154,7 @@ khmer = ["meilisearch-types/khmer"]
 vietnamese = ["meilisearch-types/vietnamese"]
 swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
 german = ["meilisearch-types/german"]
-turkish = ["meilisearch-types/turkish"]
 
 [package.metadata.mini-dashboard]
-assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip"
-sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e"
+assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
+sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"

View File

@@ -1,45 +1,44 @@
-pub mod segment_analytics;
+mod mock_analytics;
+#[cfg(feature = "analytics")]
+mod segment_analytics;
 
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
-use std::sync::Arc;
 
 use actix_web::HttpRequest;
-use index_scheduler::IndexScheduler;
-use meilisearch_auth::AuthController;
 use meilisearch_types::InstanceUid;
-use mopa::mopafy;
+pub use mock_analytics::MockAnalytics;
 use once_cell::sync::Lazy;
 use platform_dirs::AppDirs;
+use serde_json::Value;
+
+use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery};
+
+// if the analytics feature is disabled
+// the `SegmentAnalytics` point to the mock instead of the real analytics
+#[cfg(not(feature = "analytics"))]
+pub type SegmentAnalytics = mock_analytics::MockAnalytics;
+#[cfg(not(feature = "analytics"))]
+pub type SearchAggregator = mock_analytics::SearchAggregator;
+#[cfg(not(feature = "analytics"))]
+pub type SimilarAggregator = mock_analytics::SimilarAggregator;
+#[cfg(not(feature = "analytics"))]
+pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator;
+#[cfg(not(feature = "analytics"))]
+pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator;
 
 // if the feature analytics is enabled we use the real analytics
+#[cfg(feature = "analytics")]
 pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
-
-use crate::Opt;
-
-/// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name.
-#[macro_export]
-macro_rules! empty_analytics {
-($struct_name:ident, $event_name:literal) => {
-#[derive(Default)]
-struct $struct_name {}
-
-impl $crate::analytics::Aggregate for $struct_name {
-fn event_name(&self) -> &'static str {
-$event_name
-}
-
-fn aggregate(self: Box<Self>, _other: Box<Self>) -> Box<Self> {
-self
-}
-
-fn into_event(self: Box<Self>) -> serde_json::Value {
-serde_json::json!({})
-}
-}
-};
-}
+#[cfg(feature = "analytics")]
+pub type SearchAggregator = segment_analytics::SearchAggregator;
+#[cfg(feature = "analytics")]
+pub type SimilarAggregator = segment_analytics::SimilarAggregator;
+#[cfg(feature = "analytics")]
+pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator;
+#[cfg(feature = "analytics")]
+pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator;
 
 /// The Meilisearch config dir:
 /// `~/.config/Meilisearch` on *NIX or *BSD.
@@ -79,88 +78,60 @@ pub enum DocumentFetchKind {
 Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
 }
 
-/// To send an event to segment, your event must be able to aggregate itself with another event of the same type.
-pub trait Aggregate: 'static + mopa::Any + Send {
-/// The name of the event that will be sent to segment.
-fn event_name(&self) -> &'static str;
-
-/// Will be called every time an event has been used twice before segment flushed its buffer.
-fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self>
-where
-Self: Sized;
-
-/// Converts your structure to the final event that'll be sent to segment.
-fn into_event(self: Box<Self>) -> serde_json::Value;
-}
-
-mopafy!(Aggregate);
-
-/// Helper trait to define multiple aggregates with the same content but a different name.
-/// Commonly used when you must aggregate a search with POST or with GET, for example.
-pub trait AggregateMethod: 'static + Default + Send {
-fn event_name() -> &'static str;
-}
-
-/// A macro used to quickly define multiple aggregate method with their name
-/// Usage:
-/// ```rust
-/// use meilisearch::aggregate_methods;
-///
-/// aggregate_methods!(
-/// SearchGET => "Documents Searched GET",
-/// SearchPOST => "Documents Searched POST",
-/// );
-/// ```
-#[macro_export]
-macro_rules! aggregate_methods {
-($method:ident => $event_name:literal) => {
-#[derive(Default)]
-pub struct $method {}
-
-impl $crate::analytics::AggregateMethod for $method {
-fn event_name() -> &'static str {
-$event_name
-}
-}
-};
-($($method:ident => $event_name:literal,)+) => {
-$(
-aggregate_methods!($method => $event_name);
-)+
-};
-}
-
-#[derive(Clone)]
-pub struct Analytics {
-segment: Option<Arc<SegmentAnalytics>>,
-}
-
-impl Analytics {
-pub async fn new(
-opt: &Opt,
-index_scheduler: Arc<IndexScheduler>,
-auth_controller: Arc<AuthController>,
-) -> Self {
-if opt.no_analytics {
-Self { segment: None }
-} else {
-Self { segment: SegmentAnalytics::new(opt, index_scheduler, auth_controller).await }
-}
-}
-
-pub fn no_analytics() -> Self {
-Self { segment: None }
-}
-
-pub fn instance_uid(&self) -> Option<&InstanceUid> {
-self.segment.as_ref().map(|segment| segment.instance_uid.as_ref())
-}
+pub trait Analytics: Sync + Send {
+fn instance_uid(&self) -> Option<&InstanceUid>;
 
 /// The method used to publish most analytics that do not need to be batched every hours
-pub fn publish<T: Aggregate>(&self, event: T, request: &HttpRequest) {
-if let Some(ref segment) = self.segment {
-let _ = segment.sender.try_send(segment_analytics::Message::new(event, request));
-}
-}
+fn publish(&self, event_name: String, send: Value, request: Option<&HttpRequest>);
+
+/// This method should be called to aggregate a get search
+fn get_search(&self, aggregate: SearchAggregator);
+
+/// This method should be called to aggregate a post search
+fn post_search(&self, aggregate: SearchAggregator);
+
+/// This method should be called to aggregate a get similar request
+fn get_similar(&self, aggregate: SimilarAggregator);
+
+/// This method should be called to aggregate a post similar request
+fn post_similar(&self, aggregate: SimilarAggregator);
+
+/// This method should be called to aggregate a post array of searches
+fn post_multi_search(&self, aggregate: MultiSearchAggregator);
+
+/// This method should be called to aggregate post facet values searches
+fn post_facet_search(&self, aggregate: FacetSearchAggregator);
+
+// this method should be called to aggregate an add documents request
+fn add_documents(
+&self,
+documents_query: &UpdateDocumentsQuery,
+index_creation: bool,
+request: &HttpRequest,
+);
+
+// this method should be called to aggregate a fetch documents request
+fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
+
+// this method should be called to aggregate a fetch documents request
+fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
+
+// this method should be called to aggregate a add documents request
+fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest);
+
+// this method should be called to batch an update documents request
+fn update_documents(
+&self,
+documents_query: &UpdateDocumentsQuery,
+index_creation: bool,
+request: &HttpRequest,
+);
+
+// this method should be called to batch an update documents by function request
+fn update_documents_by_function(
+&self,
+documents_query: &DocumentEditionByFunction,
+index_creation: bool,
+request: &HttpRequest,
+);
 }
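The two analytics designs this file moves between are easier to compare side by side than inside the diff. A minimal, self-contained sketch (hypothetical names, no external crates, not the real Meilisearch API): on the `+` side, one trait method per event backed by a mock; on the `-` side, each event is its own type behind an `Aggregate`-like trait, and the handle is a runtime no-op when analytics are disabled.

```rust
use std::sync::Arc;

// Shape on the `+` side: one trait method per kind of event; a mock backs the
// no-analytics build, and handlers receive an `Arc<dyn ...>` trait object.
trait AnalyticsApi: Send + Sync {
    fn publish(&self, event_name: String, payload: String);
}

struct Mock;
impl AnalyticsApi for Mock {
    fn publish(&self, _event_name: String, _payload: String) {} // drop everything
}

// Shape on the `-` side: every event is a type implementing an `Aggregate`-like trait,
// and the handle holds `Option<Arc<_>>`, so "disabled" is simply `None`.
trait Aggregate: Send + 'static {
    fn event_name(&self) -> &'static str;
    fn into_event(self: Box<Self>) -> String;
}

#[derive(Clone)]
struct Handle {
    backend: Option<Arc<dyn AnalyticsApi>>, // stands in for the segment client
}

impl Handle {
    fn publish<T: Aggregate>(&self, event: T) {
        if let Some(backend) = &self.backend {
            backend.publish(event.event_name().to_owned(), Box::new(event).into_event());
        }
    }
}

struct DumpCreated; // e.g. the "Dump Created" event
impl Aggregate for DumpCreated {
    fn event_name(&self) -> &'static str {
        "Dump Created"
    }
    fn into_event(self: Box<Self>) -> String {
        "{}".to_owned()
    }
}

fn main() {
    let disabled = Handle { backend: None };
    disabled.publish(DumpCreated); // no-op: nothing is sent anywhere
}
```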

File diff suppressed because it is too large.

View File

@@ -120,7 +120,7 @@ pub fn create_app(
 search_queue: Data<SearchQueue>,
 opt: Opt,
 logs: (LogRouteHandle, LogStderrHandle),
-analytics: Data<Analytics>,
+analytics: Arc<dyn Analytics>,
 enable_dashboard: bool,
 ) -> actix_web::App<
 impl ServiceFactory<
@@ -473,14 +473,14 @@ pub fn configure_data(
 search_queue: Data<SearchQueue>,
 opt: &Opt,
 (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
-analytics: Data<Analytics>,
+analytics: Arc<dyn Analytics>,
 ) {
 let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize;
 config
 .app_data(index_scheduler)
 .app_data(auth)
 .app_data(search_queue)
-.app_data(analytics)
+.app_data(web::Data::from(analytics))
 .app_data(web::Data::new(logs_route))
 .app_data(web::Data::new(logs_stderr))
 .app_data(web::Data::new(opt.clone()))

View File

@@ -5,7 +5,6 @@ use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::thread::available_parallelism;
-use std::time::Duration;
 
 use actix_web::http::KeepAlive;
 use actix_web::web::Data;
@@ -124,12 +123,19 @@ async fn try_main() -> anyhow::Result<()> {
 let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?;
 
-let analytics =
-analytics::Analytics::new(&opt, index_scheduler.clone(), auth_controller.clone()).await;
+#[cfg(all(not(debug_assertions), feature = "analytics"))]
+let analytics = if !opt.no_analytics {
+analytics::SegmentAnalytics::new(&opt, index_scheduler.clone(), auth_controller.clone())
+.await
+} else {
+analytics::MockAnalytics::new(&opt)
+};
+#[cfg(any(debug_assertions, not(feature = "analytics")))]
+let analytics = analytics::MockAnalytics::new(&opt);
 
 print_launch_resume(&opt, analytics.clone(), config_read_from);
 
-run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?;
+run_http(index_scheduler, auth_controller, opt, log_handle, analytics).await?;
 
 Ok(())
 }
@@ -139,23 +145,16 @@ async fn run_http(
 auth_controller: Arc<AuthController>,
 opt: Opt,
 logs: (LogRouteHandle, LogStderrHandle),
-analytics: Arc<Analytics>,
+analytics: Arc<dyn Analytics>,
 ) -> anyhow::Result<()> {
 let enable_dashboard = &opt.env == "development";
 let opt_clone = opt.clone();
 let index_scheduler = Data::from(index_scheduler);
 let auth_controller = Data::from(auth_controller);
-let analytics = Data::from(analytics);
 
 let search_queue = SearchQueue::new(
 opt.experimental_search_queue_size,
-available_parallelism()
-.unwrap_or(NonZeroUsize::new(2).unwrap())
-.checked_mul(opt.experimental_nb_searches_per_core)
-.unwrap_or(NonZeroUsize::MAX),
-)
-.with_time_to_abort(Duration::from_secs(
-usize::from(opt.experimental_drop_search_after) as u64
-));
+available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()),
+);
 let search_queue = Data::new(search_queue);
 
 let http_server = HttpServer::new(move || {
@@ -181,7 +180,11 @@ async fn run_http(
 Ok(())
 }
 
-pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Option<PathBuf>) {
+pub fn print_launch_resume(
+opt: &Opt,
+analytics: Arc<dyn Analytics>,
+config_read_from: Option<PathBuf>,
+) {
 let build_info = build_info::BuildInfo::from_build();
 
 let protocol =
@@ -223,6 +226,7 @@ pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Op
 eprintln!("Prototype:\t\t{:?}", prototype);
 }
 
+#[cfg(all(not(debug_assertions), feature = "analytics"))]
 {
 if !opt.no_analytics {
 eprintln!(
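The queue-capacity expression in the `run_http` hunk above is easy to sanity-check in isolation. A standalone sketch, with the per-core value hard-coded to 4 (the default documented for the removed `experimental_nb_searches_per_core` option in the option.rs hunks below):

```rust
// Minimal sketch: parallelism times a per-core search budget, saturating on overflow,
// all kept in NonZeroUsize as on the `-` side of the run_http hunk.
use std::num::NonZeroUsize;
use std::thread::available_parallelism;

fn main() {
    let per_core = NonZeroUsize::new(4).unwrap(); // assumed default of the removed option
    let capacity = available_parallelism()
        .unwrap_or(NonZeroUsize::new(2).unwrap())
        .checked_mul(per_core)
        .unwrap_or(NonZeroUsize::MAX);
    println!("search queue can run {capacity} searches concurrently");
}
```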

View File

@@ -2,7 +2,7 @@ use std::env::VarError;
 use std::ffi::OsStr;
 use std::fmt::Display;
 use std::io::{BufReader, Read};
-use std::num::{NonZeroUsize, ParseIntError};
+use std::num::ParseIntError;
 use std::ops::Deref;
 use std::path::PathBuf;
 use std::str::FromStr;
@@ -29,6 +29,7 @@ const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
 const MEILI_ENV: &str = "MEILI_ENV";
 const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
 const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
+#[cfg(feature = "analytics")]
 const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
 const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
 const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH";
@@ -54,8 +55,6 @@ const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LO
 const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER";
 const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
 const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE";
-const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER";
-const MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE: &str = "MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE";
 const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
 "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
 const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
@@ -209,6 +208,7 @@ pub struct Opt {
 /// Meilisearch automatically collects data from all instances that do not opt out using this flag.
 /// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
 /// at any time.
+#[cfg(feature = "analytics")]
 #[serde(default)] // we can't send true
 #[clap(long, env = MEILI_NO_ANALYTICS)]
 pub no_analytics: bool,
@@ -357,26 +357,10 @@ pub struct Opt {
 /// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the
 /// queue is full it starts returning HTTP 503, Service Unavailable.
 /// The default value is 1000.
-#[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())]
-#[serde(default = "default_experimental_search_queue_size")]
+#[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = 1000)]
+#[serde(default)]
 pub experimental_search_queue_size: usize,
 
-/// Experimental drop search after. For more information, see: <https://github.com/orgs/meilisearch/discussions/783>
-///
-/// Let you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it.
-/// The default value is 60.
-#[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())]
-#[serde(default = "default_drop_search_after")]
-pub experimental_drop_search_after: NonZeroUsize,
-
-/// Experimental number of searches per core. For more information, see: <https://github.com/orgs/meilisearch/discussions/784>
-///
-/// Lets you customize how many search requests can run on each core concurrently.
-/// The default value is 4.
-#[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())]
-#[serde(default = "default_nb_searches_per_core")]
-pub experimental_nb_searches_per_core: NonZeroUsize,
-
 /// Experimental logs mode feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/723>
 ///
 /// Change the mode of the logs on the console.
@@ -423,6 +407,7 @@ pub struct Opt {
 impl Opt {
 /// Whether analytics should be enabled or not.
+#[cfg(all(not(debug_assertions), feature = "analytics"))]
 pub fn analytics(&self) -> bool {
 !self.no_analytics
 }
@@ -502,12 +487,11 @@ impl Opt {
 ignore_missing_dump: _,
 ignore_dump_if_db_exists: _,
 config_file_path: _,
+#[cfg(feature = "analytics")]
 no_analytics,
 experimental_contains_filter,
 experimental_enable_metrics,
 experimental_search_queue_size,
-experimental_drop_search_after,
-experimental_nb_searches_per_core,
 experimental_logs_mode,
 experimental_enable_logs_route,
 experimental_replication_parameters,
@@ -529,7 +513,10 @@ impl Opt {
 );
 }
 
+#[cfg(feature = "analytics")]
+{
 export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
+}
 export_to_env_if_not_present(
 MEILI_HTTP_PAYLOAD_SIZE_LIMIT,
 http_payload_size_limit.to_string(),
@@ -572,14 +559,6 @@ impl Opt {
 MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE,
 experimental_search_queue_size.to_string(),
 );
-export_to_env_if_not_present(
-MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER,
-experimental_drop_search_after.to_string(),
-);
-export_to_env_if_not_present(
-MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE,
-experimental_nb_searches_per_core.to_string(),
-);
 export_to_env_if_not_present(
 MEILI_EXPERIMENTAL_LOGS_MODE,
 experimental_logs_mode.to_string(),
@@ -911,18 +890,6 @@ fn default_dump_dir() -> PathBuf {
 PathBuf::from(DEFAULT_DUMP_DIR)
 }
 
-fn default_experimental_search_queue_size() -> usize {
-1000
-}
-
-fn default_drop_search_after() -> NonZeroUsize {
-NonZeroUsize::new(60).unwrap()
-}
-
-fn default_nb_searches_per_core() -> NonZeroUsize {
-NonZeroUsize::new(4).unwrap()
-}
-
 /// Indicates if a snapshot was scheduled, and if yes with which interval.
 #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
 pub enum ScheduleSnapshot {

View File

@@ -4,6 +4,7 @@ use index_scheduler::IndexScheduler;
 use meilisearch_auth::AuthController;
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::tasks::KindWithContent;
+use serde_json::json;
 use tracing::debug;
 
 use crate::analytics::Analytics;
@@ -17,16 +18,14 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
 cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump))));
 }
 
-crate::empty_analytics!(DumpAnalytics, "Dump Created");
-
 pub async fn create_dump(
 index_scheduler: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<IndexScheduler>>,
 auth_controller: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<AuthController>>,
 req: HttpRequest,
 opt: web::Data<Opt>,
-analytics: web::Data<Analytics>,
+analytics: web::Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
-analytics.publish(DumpAnalytics::default(), &req);
+analytics.publish("Dump Created".to_string(), json!({}), Some(&req));
 
 let task = KindWithContent::DumpCreation {
 keys: auth_controller.list_keys()?,

View File

@@ -6,10 +6,10 @@ use index_scheduler::IndexScheduler;
 use meilisearch_types::deserr::DeserrJsonError;
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::keys::actions;
-use serde::Serialize;
+use serde_json::json;
 use tracing::debug;
 
-use crate::analytics::{Aggregate, Analytics};
+use crate::analytics::Analytics;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::GuardedData;
 use crate::extractors::sequential_extractor::SeqHandler;
@@ -17,7 +17,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
 pub fn configure(cfg: &mut web::ServiceConfig) {
 cfg.service(
 web::resource("")
-.route(web::get().to(get_features))
+.route(web::get().to(SeqHandler(get_features)))
 .route(web::patch().to(SeqHandler(patch_features))),
 );
 }
@@ -27,9 +27,12 @@ async fn get_features(
 ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>,
 Data<IndexScheduler>,
 >,
+req: HttpRequest,
+analytics: Data<dyn Analytics>,
 ) -> HttpResponse {
 let features = index_scheduler.features();
 
+analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
 let features = features.runtime_features();
 debug!(returns = ?features, "Get features");
 HttpResponse::Ok().json(features)
@@ -50,35 +53,6 @@ pub struct RuntimeTogglableFeatures {
 pub contains_filter: Option<bool>,
 }
 
-#[derive(Serialize)]
-pub struct PatchExperimentalFeatureAnalytics {
-vector_store: bool,
-metrics: bool,
-logs_route: bool,
-edit_documents_by_function: bool,
-contains_filter: bool,
-}
-
-impl Aggregate for PatchExperimentalFeatureAnalytics {
-fn event_name(&self) -> &'static str {
-"Experimental features Updated"
-}
-
-fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
-Box::new(Self {
-vector_store: new.vector_store,
-metrics: new.metrics,
-logs_route: new.logs_route,
-edit_documents_by_function: new.edit_documents_by_function,
-contains_filter: new.contains_filter,
-})
-}
-
-fn into_event(self: Box<Self>) -> serde_json::Value {
-serde_json::to_value(*self).unwrap_or_default()
-}
-}
-
 async fn patch_features(
 index_scheduler: GuardedData<
 ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>,
@@ -86,7 +60,7 @@ async fn patch_features(
 >,
 new_features: AwebJson<RuntimeTogglableFeatures, DeserrJsonError>,
 req: HttpRequest,
-analytics: Data<Analytics>,
+analytics: Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
 let features = index_scheduler.features();
 debug!(parameters = ?new_features, "Patch features");
@@ -115,14 +89,15 @@ async fn patch_features(
 } = new_features;
 
 analytics.publish(
-PatchExperimentalFeatureAnalytics {
-vector_store,
-metrics,
-logs_route,
-edit_documents_by_function,
-contains_filter,
-},
-&req,
+"Experimental features Updated".to_string(),
+json!({
+"vector_store": vector_store,
+"metrics": metrics,
+"logs_route": logs_route,
+"edit_documents_by_function": edit_documents_by_function,
+"contains_filter": contains_filter,
+}),
+Some(&req),
 );
 index_scheduler.put_runtime_features(new_features)?;
 debug!(returns = ?new_features, "Patch features");

View File

@@ -1,6 +1,4 @@
-use std::collections::HashSet;
 use std::io::ErrorKind;
-use std::marker::PhantomData;
 
 use actix_web::http::header::CONTENT_TYPE;
 use actix_web::web::Data;
@@ -25,14 +23,14 @@ use meilisearch_types::tasks::KindWithContent;
 use meilisearch_types::{milli, Document, Index};
 use mime::Mime;
 use once_cell::sync::Lazy;
-use serde::{Deserialize, Serialize};
+use serde::Deserialize;
 use serde_json::Value;
 use tempfile::tempfile;
 use tokio::fs::File;
 use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter};
 use tracing::debug;
 
-use crate::analytics::{Aggregate, AggregateMethod, Analytics};
+use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind};
 use crate::error::MeilisearchHttpError;
 use crate::error::PayloadError::ReceivePayload;
 use crate::extractors::authentication::policies::*;
@@ -43,7 +41,7 @@ use crate::routes::{
 get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
 };
 use crate::search::{parse_filter, RetrieveVectors};
-use crate::{aggregate_methods, Opt};
+use crate::Opt;
 
 static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
 vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
@@ -102,84 +100,12 @@ pub struct GetDocument {
 retrieve_vectors: Param<bool>,
 }
 
-aggregate_methods!(
-DocumentsGET => "Documents Fetched GET",
-DocumentsPOST => "Documents Fetched POST",
-);
-
-#[derive(Serialize)]
-pub struct DocumentsFetchAggregator<Method: AggregateMethod> {
-// a call on ../documents/:doc_id
-per_document_id: bool,
-// if a filter was used
-per_filter: bool,
-#[serde(rename = "vector.retrieve_vectors")]
-retrieve_vectors: bool,
-// pagination
-#[serde(rename = "pagination.max_limit")]
-max_limit: usize,
-#[serde(rename = "pagination.max_offset")]
-max_offset: usize,
-marker: std::marker::PhantomData<Method>,
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum DocumentFetchKind {
-PerDocumentId { retrieve_vectors: bool },
-Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
-}
-
-impl<Method: AggregateMethod> DocumentsFetchAggregator<Method> {
-pub fn from_query(query: &DocumentFetchKind) -> Self {
-let (limit, offset, retrieve_vectors) = match query {
-DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
-DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
-(*limit, *offset, *retrieve_vectors)
-}
-};
-
-Self {
-per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
-per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
-max_limit: limit,
-max_offset: offset,
-retrieve_vectors,
-marker: PhantomData,
-}
-}
-}
-
-impl<Method: AggregateMethod> Aggregate for DocumentsFetchAggregator<Method> {
-fn event_name(&self) -> &'static str {
-Method::event_name()
-}
-
-fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
-Box::new(Self {
-per_document_id: self.per_document_id | new.per_document_id,
-per_filter: self.per_filter | new.per_filter,
-retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors,
-max_limit: self.max_limit.max(new.max_limit),
-max_offset: self.max_offset.max(new.max_offset),
-marker: PhantomData,
-})
-}
-
-fn into_event(self: Box<Self>) -> serde_json::Value {
-serde_json::to_value(*self).unwrap_or_default()
-}
-}
-
 pub async fn get_document(
 index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
 document_param: web::Path<DocumentParam>,
 params: AwebQueryParameter<GetDocument, DeserrQueryParamError>,
 req: HttpRequest,
-analytics: web::Data<Analytics>,
+analytics: web::Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
 let DocumentParam { index_uid, document_id } = document_param.into_inner();
 debug!(parameters = ?params, "Get document");
@@ -191,15 +117,8 @@ pub async fn get_document(
 let features = index_scheduler.features();
 let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
 
-analytics.publish(
-DocumentsFetchAggregator::<DocumentsGET> {
-retrieve_vectors: param_retrieve_vectors.0,
-per_document_id: true,
-per_filter: false,
-max_limit: 0,
-max_offset: 0,
-marker: PhantomData,
-},
+analytics.get_fetch_documents(
+&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
 &req,
 );
@@ -210,52 +129,17 @@ pub async fn get_document(
 Ok(HttpResponse::Ok().json(document))
 }
 
-#[derive(Serialize)]
-pub struct DocumentsDeletionAggregator {
-per_document_id: bool,
-clear_all: bool,
-per_batch: bool,
-per_filter: bool,
-}
-
-impl Aggregate for DocumentsDeletionAggregator {
-fn event_name(&self) -> &'static str {
-"Documents Deleted"
-}
-
-fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
-Box::new(Self {
-per_document_id: self.per_document_id | new.per_document_id,
-clear_all: self.clear_all | new.clear_all,
-per_batch: self.per_batch | new.per_batch,
-per_filter: self.per_filter | new.per_filter,
-})
-}
-
-fn into_event(self: Box<Self>) -> serde_json::Value {
-serde_json::to_value(*self).unwrap_or_default()
-}
-}
-
 pub async fn delete_document(
 index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
 path: web::Path<DocumentParam>,
 req: HttpRequest,
 opt: web::Data<Opt>,
-analytics: web::Data<Analytics>,
+analytics: web::Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
 let DocumentParam { index_uid, document_id } = path.into_inner();
 let index_uid = IndexUid::try_from(index_uid)?;
 
-analytics.publish(
-DocumentsDeletionAggregator {
-per_document_id: true,
-clear_all: false,
-per_batch: false,
-per_filter: false,
-},
-&req,
-);
+analytics.delete_documents(DocumentDeletionKind::PerDocumentId, &req);
 
 let task = KindWithContent::DocumentDeletion {
 index_uid: index_uid.to_string(),
@@ -306,19 +190,17 @@ pub async fn documents_by_query_post(
 index_uid: web::Path<String>,
 body: AwebJson<BrowseQuery, DeserrJsonError>,
 req: HttpRequest,
-analytics: web::Data<Analytics>,
+analytics: web::Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
 let body = body.into_inner();
 debug!(parameters = ?body, "Get documents POST");
 
-analytics.publish(
-DocumentsFetchAggregator::<DocumentsPOST> {
-per_filter: body.filter.is_some(),
+analytics.post_fetch_documents(
+&DocumentFetchKind::Normal {
+with_filter: body.filter.is_some(),
+limit: body.limit,
+offset: body.offset,
 retrieve_vectors: body.retrieve_vectors,
-max_limit: body.limit,
-max_offset: body.offset,
-per_document_id: false,
-marker: PhantomData,
 },
 &req,
 );
@@ -331,7 +213,7 @@ pub async fn get_documents(
 index_uid: web::Path<String>,
 params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>,
 req: HttpRequest,
-analytics: web::Data<Analytics>,
+analytics: web::Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
 debug!(parameters = ?params, "Get documents GET");
@@ -353,14 +235,12 @@ pub async fn get_documents(
 filter,
 };
 
-analytics.publish(
-DocumentsFetchAggregator::<DocumentsGET> {
-per_filter: query.filter.is_some(),
+analytics.get_fetch_documents(
+&DocumentFetchKind::Normal {
+with_filter: query.filter.is_some(),
+limit: query.limit,
+offset: query.offset,
 retrieve_vectors: query.retrieve_vectors,
-max_limit: query.limit,
-max_offset: query.offset,
-per_document_id: false,
-marker: PhantomData,
 },
 &req,
 );
@ -418,39 +298,6 @@ fn from_char_csv_delimiter(
} }
} }
aggregate_methods!(
Replaced => "Documents Added",
Updated => "Documents Updated",
);
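The `aggregate_methods!` invocation above only declares the marker-to-event-name mapping; the macro and the `AggregateMethod` trait themselves are defined elsewhere and do not appear in this diff. A plausible expansion, inferred from how `Method::event_name()` and `PhantomData<Method>` are used below, could look like this (an assumption, not the crate's code):

// Sketch of what aggregate_methods! presumably expands to (hypothetical).
pub trait AggregateMethod: 'static {
    fn event_name() -> &'static str;
}

macro_rules! aggregate_methods {
    ($($method:ident => $event_name:literal,)+) => {
        $(
            #[derive(Default)]
            pub struct $method;

            impl AggregateMethod for $method {
                fn event_name() -> &'static str {
                    $event_name
                }
            }
        )+
    };
}

// With the invocation above, DocumentsAggregator::<Replaced> would report as
// "Documents Added" and DocumentsAggregator::<Updated> as "Documents Updated".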
#[derive(Serialize)]
pub struct DocumentsAggregator<T: AggregateMethod> {
payload_types: HashSet<String>,
primary_key: HashSet<String>,
index_creation: bool,
#[serde(skip)]
method: PhantomData<T>,
}
impl<Method: AggregateMethod> Aggregate for DocumentsAggregator<Method> {
fn event_name(&self) -> &'static str {
Method::event_name()
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self {
payload_types: self.payload_types.union(&new.payload_types).cloned().collect(),
primary_key: self.primary_key.union(&new.primary_key).cloned().collect(),
index_creation: self.index_creation | new.index_creation,
method: PhantomData,
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(self).unwrap_or_default()
}
}
pub async fn replace_documents( pub async fn replace_documents(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
@ -458,32 +305,16 @@ pub async fn replace_documents(
body: Payload, body: Payload,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
debug!(parameters = ?params, "Replace documents"); debug!(parameters = ?params, "Replace documents");
let params = params.into_inner(); let params = params.into_inner();
let mut content_types = HashSet::new(); analytics.add_documents(
let content_type = req &params,
.headers() index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
.get(CONTENT_TYPE)
.and_then(|s| s.to_str().ok())
.unwrap_or("unknown")
.to_string();
content_types.insert(content_type);
let mut primary_keys = HashSet::new();
if let Some(primary_key) = params.primary_key.clone() {
primary_keys.insert(primary_key);
}
analytics.publish(
DocumentsAggregator::<Replaced> {
payload_types: content_types,
primary_key: primary_keys,
index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
method: PhantomData,
},
&req, &req,
); );
@ -515,32 +346,16 @@ pub async fn update_documents(
body: Payload, body: Payload,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let params = params.into_inner(); let params = params.into_inner();
debug!(parameters = ?params, "Update documents"); debug!(parameters = ?params, "Update documents");
let mut content_types = HashSet::new(); analytics.add_documents(
let content_type = req &params,
.headers() index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
.get(CONTENT_TYPE)
.and_then(|s| s.to_str().ok())
.unwrap_or("unknown")
.to_string();
content_types.insert(content_type);
let mut primary_keys = HashSet::new();
if let Some(primary_key) = params.primary_key.clone() {
primary_keys.insert(primary_key);
}
analytics.publish(
DocumentsAggregator::<Updated> {
payload_types: content_types,
primary_key: primary_keys,
index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
method: PhantomData,
},
&req, &req,
); );
@ -709,20 +524,12 @@ pub async fn delete_documents_batch(
body: web::Json<Vec<Value>>, body: web::Json<Vec<Value>>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Delete documents by batch"); debug!(parameters = ?body, "Delete documents by batch");
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
analytics.publish( analytics.delete_documents(DocumentDeletionKind::PerBatch, &req);
DocumentsDeletionAggregator {
per_batch: true,
per_document_id: false,
clear_all: false,
per_filter: false,
},
&req,
);
let ids = body let ids = body
.iter() .iter()
@ -755,22 +562,14 @@ pub async fn delete_documents_by_filter(
body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>, body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Delete documents by filter"); debug!(parameters = ?body, "Delete documents by filter");
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let index_uid = index_uid.into_inner(); let index_uid = index_uid.into_inner();
let filter = body.into_inner().filter; let filter = body.into_inner().filter;
analytics.publish( analytics.delete_documents(DocumentDeletionKind::PerFilter, &req);
DocumentsDeletionAggregator {
per_filter: true,
per_document_id: false,
clear_all: false,
per_batch: false,
},
&req,
);
// we ensure the filter is well formed before enqueuing it // we ensure the filter is well formed before enqueuing it
crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())? crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())?
@ -800,41 +599,13 @@ pub struct DocumentEditionByFunction {
pub function: String, pub function: String,
} }
#[derive(Serialize)]
struct EditDocumentsByFunctionAggregator {
// Set to true if at least one request was filtered
filtered: bool,
// Set to true if at least one request contained a context
with_context: bool,
index_creation: bool,
}
impl Aggregate for EditDocumentsByFunctionAggregator {
fn event_name(&self) -> &'static str {
"Documents Edited By Function"
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self {
filtered: self.filtered | new.filtered,
with_context: self.with_context | new.with_context,
index_creation: self.index_creation | new.index_creation,
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(*self).unwrap_or_default()
}
}
pub async fn edit_documents_by_function( pub async fn edit_documents_by_function(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ALL }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ALL }>, Data<IndexScheduler>>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebJson<DocumentEditionByFunction, DeserrJsonError>, params: AwebJson<DocumentEditionByFunction, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?params, "Edit documents by function"); debug!(parameters = ?params, "Edit documents by function");
@ -846,12 +617,9 @@ pub async fn edit_documents_by_function(
let index_uid = index_uid.into_inner(); let index_uid = index_uid.into_inner();
let params = params.into_inner(); let params = params.into_inner();
analytics.publish( analytics.update_documents_by_function(
EditDocumentsByFunctionAggregator { &params,
filtered: params.filter.is_some(), index_scheduler.index(&index_uid).is_err(),
with_context: params.context.is_some(),
index_creation: index_scheduler.index(&index_uid).is_err(),
},
&req, &req,
); );
@ -902,18 +670,10 @@ pub async fn clear_all_documents(
index_uid: web::Path<String>, index_uid: web::Path<String>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
analytics.publish( analytics.delete_documents(DocumentDeletionKind::ClearAll, &req);
DocumentsDeletionAggregator {
clear_all: true,
per_document_id: false,
per_batch: false,
per_filter: false,
},
&req,
);
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
let uid = get_task_id(&req, &opt)?; let uid = get_task_id(&req, &opt)?;

View File

@ -1,5 +1,3 @@
use std::collections::{BinaryHeap, HashSet};
use actix_web::web::Data; use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse}; use actix_web::{web, HttpRequest, HttpResponse};
use deserr::actix_web::AwebJson; use deserr::actix_web::AwebJson;
@ -12,15 +10,14 @@ use meilisearch_types::locales::Locale;
use serde_json::Value; use serde_json::Value;
use tracing::debug; use tracing::debug;
use crate::analytics::{Aggregate, Analytics}; use crate::analytics::{Analytics, FacetSearchAggregator};
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::routes::indexes::search::search_kind; use crate::routes::indexes::search::search_kind;
use crate::search::{ use crate::search::{
add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy, add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
DEFAULT_SEARCH_OFFSET,
}; };
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -56,122 +53,20 @@ pub struct FacetSearchQuery {
pub locales: Option<Vec<Locale>>, pub locales: Option<Vec<Locale>>,
} }
#[derive(Default)]
pub struct FacetSearchAggregator {
// requests
total_received: usize,
total_succeeded: usize,
time_spent: BinaryHeap<usize>,
// The set of all facetNames that were used
facet_names: HashSet<String>,
// Has there been any parameter other than the facetName or facetQuery ones?
additional_search_parameters_provided: bool,
}
impl FacetSearchAggregator {
#[allow(clippy::field_reassign_with_default)]
pub fn from_query(query: &FacetSearchQuery) -> Self {
let FacetSearchQuery {
facet_query: _,
facet_name,
vector,
q,
filter,
matching_strategy,
attributes_to_search_on,
hybrid,
ranking_score_threshold,
locales,
} = query;
Self {
total_received: 1,
facet_names: Some(facet_name.clone()).into_iter().collect(),
additional_search_parameters_provided: q.is_some()
|| vector.is_some()
|| filter.is_some()
|| *matching_strategy != MatchingStrategy::default()
|| attributes_to_search_on.is_some()
|| hybrid.is_some()
|| ranking_score_threshold.is_some()
|| locales.is_some(),
..Default::default()
}
}
pub fn succeed(&mut self, result: &FacetSearchResult) {
let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result;
self.total_succeeded = 1;
self.time_spent.push(*processing_time_ms as usize);
}
}
impl Aggregate for FacetSearchAggregator {
fn event_name(&self) -> &'static str {
"Facet Searched POST"
}
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
for time in new.time_spent {
self.time_spent.push(time);
}
Box::new(Self {
total_received: self.total_received.saturating_add(new.total_received),
total_succeeded: self.total_succeeded.saturating_add(new.total_succeeded),
time_spent: self.time_spent,
facet_names: self.facet_names.union(&new.facet_names).cloned().collect(),
additional_search_parameters_provided: self.additional_search_parameters_provided
| new.additional_search_parameters_provided,
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
let Self {
total_received,
total_succeeded,
time_spent,
facet_names,
additional_search_parameters_provided,
} = *self;
// the index of the 99th percentile value
let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.;
// get all the values in sorted order
let time_spent = time_spent.into_sorted_vec();
// we are only interested in the slowest value among the 99% fastest results
let time_spent = time_spent.get(percentile_99th as usize);
serde_json::json!({
"requests": {
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
"total_succeeded": total_succeeded,
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
"total_received": total_received,
},
"facets": {
"total_distinct_facet_count": facet_names.len(),
"additional_search_parameters_provided": additional_search_parameters_provided,
},
})
}
}
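As a quick sanity check of the percentile arithmetic in `into_event` above, here is a small standalone illustration (the request count is made up and not taken from the diff):

fn main() {
    // With 200 succeeded facet searches, the formula picks index 198 of the
    // sorted timings, i.e. the slowest of the 99% fastest requests.
    let total_succeeded = 200usize;
    let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.; // 198.01
    assert_eq!(percentile_99th as usize, 198);
}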
pub async fn search( pub async fn search(
index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>,
search_queue: Data<SearchQueue>, search_queue: Data<SearchQueue>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebJson<FacetSearchQuery, DeserrJsonError>, params: AwebJson<FacetSearchQuery, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let query = params.into_inner(); let query = params.into_inner();
debug!(parameters = ?query, "Facet search"); debug!(parameters = ?query, "Facet search");
let mut aggregate = FacetSearchAggregator::from_query(&query); let mut aggregate = FacetSearchAggregator::from_query(&query, &req);
let facet_query = query.facet_query.clone(); let facet_query = query.facet_query.clone();
let facet_name = query.facet_name.clone(); let facet_name = query.facet_name.clone();
@ -205,7 +100,7 @@ pub async fn search(
if let Ok(ref search_result) = search_result { if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result); aggregate.succeed(search_result);
} }
analytics.publish(aggregate, &req); analytics.post_facet_search(aggregate);
let search_result = search_result?; let search_result = search_result?;

View File

@ -1,4 +1,3 @@
use std::collections::BTreeSet;
use std::convert::Infallible; use std::convert::Infallible;
use actix_web::web::Data; use actix_web::web::Data;
@ -14,11 +13,12 @@ use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::{self, FieldDistribution, Index}; use meilisearch_types::milli::{self, FieldDistribution, Index};
use meilisearch_types::tasks::KindWithContent; use meilisearch_types::tasks::KindWithContent;
use serde::Serialize; use serde::Serialize;
use serde_json::json;
use time::OffsetDateTime; use time::OffsetDateTime;
use tracing::debug; use tracing::debug;
use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
use crate::analytics::{Aggregate, Analytics}; use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
@ -28,11 +28,8 @@ use crate::Opt;
pub mod documents; pub mod documents;
pub mod facet_search; pub mod facet_search;
pub mod search; pub mod search;
mod search_analytics;
pub mod settings; pub mod settings;
mod settings_analytics;
pub mod similar; pub mod similar;
mod similar_analytics;
pub fn configure(cfg: &mut web::ServiceConfig) { pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service( cfg.service(
@ -126,31 +123,12 @@ pub struct IndexCreateRequest {
primary_key: Option<String>, primary_key: Option<String>,
} }
#[derive(Serialize)]
struct IndexCreatedAggregate {
primary_key: BTreeSet<String>,
}
impl Aggregate for IndexCreatedAggregate {
fn event_name(&self) -> &'static str {
"Index Created"
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() })
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(*self).unwrap_or_default()
}
}
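The `aggregate` method above merges concurrent "Index Created" events by taking the union of the primary keys seen. A minimal standalone illustration of that merge (hypothetical values, not from the diff):

use std::collections::BTreeSet;

fn main() {
    // Two aggregated creations, one with primary key "id" and one with "uid",
    // end up reporting the full set of primary keys that were used.
    let a: BTreeSet<String> = ["id".to_string()].into_iter().collect();
    let b: BTreeSet<String> = ["uid".to_string()].into_iter().collect();
    let merged: BTreeSet<String> = a.union(&b).cloned().collect();
    let expected: BTreeSet<String> =
        ["id".to_string(), "uid".to_string()].into_iter().collect();
    assert_eq!(merged, expected);
}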
pub async fn create_index( pub async fn create_index(
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>,
body: AwebJson<IndexCreateRequest, DeserrJsonError>, body: AwebJson<IndexCreateRequest, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Create index"); debug!(parameters = ?body, "Create index");
let IndexCreateRequest { primary_key, uid } = body.into_inner(); let IndexCreateRequest { primary_key, uid } = body.into_inner();
@ -158,8 +136,9 @@ pub async fn create_index(
let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid); let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid);
if allow_index_creation { if allow_index_creation {
analytics.publish( analytics.publish(
IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() }, "Index Created".to_string(),
&req, json!({ "primary_key": primary_key }),
Some(&req),
); );
let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key };
@ -215,38 +194,21 @@ pub async fn get_index(
Ok(HttpResponse::Ok().json(index_view)) Ok(HttpResponse::Ok().json(index_view))
} }
#[derive(Serialize)]
struct IndexUpdatedAggregate {
primary_key: BTreeSet<String>,
}
impl Aggregate for IndexUpdatedAggregate {
fn event_name(&self) -> &'static str {
"Index Updated"
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() })
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(*self).unwrap_or_default()
}
}
pub async fn update_index( pub async fn update_index(
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_UPDATE }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_UPDATE }>, Data<IndexScheduler>>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
body: AwebJson<UpdateIndexRequest, DeserrJsonError>, body: AwebJson<UpdateIndexRequest, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Update index"); debug!(parameters = ?body, "Update index");
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let body = body.into_inner(); let body = body.into_inner();
analytics.publish( analytics.publish(
IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() }, "Index Updated".to_string(),
&req, json!({ "primary_key": body.primary_key }),
Some(&req),
); );
let task = KindWithContent::IndexUpdate { let task = KindWithContent::IndexUpdate {

View File

@ -13,13 +13,12 @@ use meilisearch_types::serde_cs::vec::CS;
use serde_json::Value; use serde_json::Value;
use tracing::debug; use tracing::debug;
use crate::analytics::Analytics; use crate::analytics::{Analytics, SearchAggregator};
use crate::error::MeilisearchHttpError; use crate::error::MeilisearchHttpError;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST};
use crate::search::{ use crate::search::{
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
@ -226,7 +225,7 @@ pub async fn search_with_url_query(
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebQueryParameter<SearchQueryGet, DeserrQueryParamError>, params: AwebQueryParameter<SearchQueryGet, DeserrQueryParamError>,
req: HttpRequest, req: HttpRequest,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?params, "Search get"); debug!(parameters = ?params, "Search get");
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -238,7 +237,7 @@ pub async fn search_with_url_query(
add_search_rules(&mut query.filter, search_rules); add_search_rules(&mut query.filter, search_rules);
} }
let mut aggregate = SearchAggregator::<SearchGET>::from_query(&query); let mut aggregate = SearchAggregator::from_query(&query, &req);
let index = index_scheduler.index(&index_uid)?; let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features(); let features = index_scheduler.features();
@ -255,7 +254,7 @@ pub async fn search_with_url_query(
if let Ok(ref search_result) = search_result { if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result); aggregate.succeed(search_result);
} }
analytics.publish(aggregate, &req); analytics.get_search(aggregate);
let search_result = search_result?; let search_result = search_result?;
@ -269,7 +268,7 @@ pub async fn search_with_post(
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebJson<SearchQuery, DeserrJsonError>, params: AwebJson<SearchQuery, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -281,7 +280,7 @@ pub async fn search_with_post(
add_search_rules(&mut query.filter, search_rules); add_search_rules(&mut query.filter, search_rules);
} }
let mut aggregate = SearchAggregator::<SearchPOST>::from_query(&query); let mut aggregate = SearchAggregator::from_query(&query, &req);
let index = index_scheduler.index(&index_uid)?; let index = index_scheduler.index(&index_uid)?;
@ -303,7 +302,7 @@ pub async fn search_with_post(
MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
} }
} }
analytics.publish(aggregate, &req); analytics.post_search(aggregate);
let search_result = search_result?; let search_result = search_result?;

View File

@ -1,483 +0,0 @@
use std::collections::{BTreeSet, BinaryHeap, HashMap};
use meilisearch_types::locales::Locale;
use once_cell::sync::Lazy;
use regex::Regex;
use serde_json::{json, Value};
use crate::aggregate_methods;
use crate::analytics::{Aggregate, AggregateMethod};
use crate::search::{
SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
DEFAULT_SEMANTIC_RATIO,
};
aggregate_methods!(
SearchGET => "Documents Searched GET",
SearchPOST => "Documents Searched POST",
);
#[derive(Default)]
pub struct SearchAggregator<Method: AggregateMethod> {
// requests
total_received: usize,
total_succeeded: usize,
total_degraded: usize,
total_used_negative_operator: usize,
time_spent: BinaryHeap<usize>,
// sort
sort_with_geo_point: bool,
// every time a request has a sort, this field must be incremented by the number of terms it contains
sort_sum_of_criteria_terms: usize,
// every time a request has a sort, this field must be incremented by one
sort_total_number_of_criteria: usize,
// distinct
distinct: bool,
// filter
filter_with_geo_radius: bool,
filter_with_geo_bounding_box: bool,
// every time a request has a filter, this field must be incremented by the number of terms it contains
filter_sum_of_criteria_terms: usize,
// every time a request has a filter, this field must be incremented by one
filter_total_number_of_criteria: usize,
used_syntax: HashMap<String, usize>,
// attributes_to_search_on
// every time a search is done using attributes_to_search_on
attributes_to_search_on_total_number_of_uses: usize,
// q
// The maximum number of terms in a q request
max_terms_number: usize,
// vector
// The maximum number of floats in a vector request
max_vector_size: usize,
// Whether the semantic ratio passed to a hybrid search differs from the default ratio.
semantic_ratio: bool,
hybrid: bool,
retrieve_vectors: bool,
// every time a search is done, we increment the counter for the matching strategy that was used
matching_strategy: HashMap<String, usize>,
// List of the unique Locales passed as parameter
locales: BTreeSet<Locale>,
// pagination
max_limit: usize,
max_offset: usize,
finite_pagination: usize,
// formatting
max_attributes_to_retrieve: usize,
max_attributes_to_highlight: usize,
highlight_pre_tag: bool,
highlight_post_tag: bool,
max_attributes_to_crop: usize,
crop_marker: bool,
show_matches_position: bool,
crop_length: bool,
// facets
facets_sum_of_terms: usize,
facets_total_number_of_facets: usize,
// scoring
show_ranking_score: bool,
show_ranking_score_details: bool,
ranking_score_threshold: bool,
marker: std::marker::PhantomData<Method>,
}
impl<Method: AggregateMethod> SearchAggregator<Method> {
#[allow(clippy::field_reassign_with_default)]
pub fn from_query(query: &SearchQuery) -> Self {
let SearchQuery {
q,
vector,
offset,
limit,
page,
hits_per_page,
attributes_to_retrieve: _,
retrieve_vectors,
attributes_to_crop: _,
crop_length,
attributes_to_highlight: _,
show_matches_position,
show_ranking_score,
show_ranking_score_details,
filter,
sort,
distinct,
facets: _,
highlight_pre_tag,
highlight_post_tag,
crop_marker,
matching_strategy,
attributes_to_search_on,
hybrid,
ranking_score_threshold,
locales,
} = query;
let mut ret = Self::default();
ret.total_received = 1;
if let Some(ref sort) = sort {
ret.sort_total_number_of_criteria = 1;
ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint("));
ret.sort_sum_of_criteria_terms = sort.len();
}
ret.distinct = distinct.is_some();
if let Some(ref filter) = filter {
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
ret.filter_total_number_of_criteria = 1;
let syntax = match filter {
Value::String(_) => "string".to_string(),
Value::Array(values) => {
if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
"mixed".to_string()
} else {
"array".to_string()
}
}
_ => "none".to_string(),
};
// record which filter syntax was used
ret.used_syntax.insert(syntax, 1);
let stringified_filters = filter.to_string();
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
}
// attributes_to_search_on
if attributes_to_search_on.is_some() {
ret.attributes_to_search_on_total_number_of_uses = 1;
}
if let Some(ref q) = q {
ret.max_terms_number = q.split_whitespace().count();
}
if let Some(ref vector) = vector {
ret.max_vector_size = vector.len();
}
ret.retrieve_vectors |= retrieve_vectors;
if query.is_finite_pagination() {
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
ret.max_limit = limit;
ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit;
ret.finite_pagination = 1;
} else {
ret.max_limit = *limit;
ret.max_offset = *offset;
ret.finite_pagination = 0;
}
ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1);
if let Some(locales) = locales {
ret.locales = locales.iter().copied().collect();
}
ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG();
ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG();
ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER();
ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH();
ret.show_matches_position = *show_matches_position;
ret.show_ranking_score = *show_ranking_score;
ret.show_ranking_score_details = *show_ranking_score_details;
ret.ranking_score_threshold = ranking_score_threshold.is_some();
if let Some(hybrid) = hybrid {
ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO();
ret.hybrid = true;
}
ret
}
pub fn succeed(&mut self, result: &SearchResult) {
let SearchResult {
hits: _,
query: _,
processing_time_ms,
hits_info: _,
semantic_hit_count: _,
facet_distribution: _,
facet_stats: _,
degraded,
used_negative_operator,
} = result;
self.total_succeeded = self.total_succeeded.saturating_add(1);
if *degraded {
self.total_degraded = self.total_degraded.saturating_add(1);
}
if *used_negative_operator {
self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1);
}
self.time_spent.push(*processing_time_ms as usize);
}
}
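The filter-handling branch of `from_query` above classifies the filter syntax as "string", "array", or "mixed". The following standalone sketch re-states that classification so it can be checked in isolation (the helper name is invented for illustration; it is not part of the diff):

use regex::Regex;
use serde_json::{json, Value};

// Hypothetical helper mirroring the syntax classification in from_query.
fn classify_filter_syntax(filter: &Value) -> &'static str {
    let re = Regex::new("AND | OR").unwrap();
    match filter {
        Value::String(_) => "string",
        Value::Array(values) => {
            if values.iter().map(|v| v.to_string()).any(|s| re.is_match(&s)) {
                "mixed"
            } else {
                "array"
            }
        }
        _ => "none",
    }
}

fn main() {
    assert_eq!(classify_filter_syntax(&json!("genre = horror AND year > 2000")), "string");
    assert_eq!(classify_filter_syntax(&json!(["genre = horror", "year > 2000"])), "array");
    assert_eq!(classify_filter_syntax(&json!(["genre = horror AND year > 2000"])), "mixed");
}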
impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
fn event_name(&self) -> &'static str {
Method::event_name()
}
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
let Self {
total_received,
total_succeeded,
mut time_spent,
sort_with_geo_point,
sort_sum_of_criteria_terms,
sort_total_number_of_criteria,
distinct,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_sum_of_criteria_terms,
filter_total_number_of_criteria,
used_syntax,
attributes_to_search_on_total_number_of_uses,
max_terms_number,
max_vector_size,
retrieve_vectors,
matching_strategy,
max_limit,
max_offset,
finite_pagination,
max_attributes_to_retrieve,
max_attributes_to_highlight,
highlight_pre_tag,
highlight_post_tag,
max_attributes_to_crop,
crop_marker,
show_matches_position,
crop_length,
facets_sum_of_terms,
facets_total_number_of_facets,
show_ranking_score,
show_ranking_score_details,
semantic_ratio,
hybrid,
total_degraded,
total_used_negative_operator,
ranking_score_threshold,
mut locales,
marker: _,
} = *new;
// request
self.total_received = self.total_received.saturating_add(total_received);
self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
self.total_degraded = self.total_degraded.saturating_add(total_degraded);
self.total_used_negative_operator =
self.total_used_negative_operator.saturating_add(total_used_negative_operator);
self.time_spent.append(&mut time_spent);
// sort
self.sort_with_geo_point |= sort_with_geo_point;
self.sort_sum_of_criteria_terms =
self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms);
self.sort_total_number_of_criteria =
self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria);
// distinct
self.distinct |= distinct;
// filter
self.filter_with_geo_radius |= filter_with_geo_radius;
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
self.filter_sum_of_criteria_terms =
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
self.filter_total_number_of_criteria =
self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria);
for (key, value) in used_syntax.into_iter() {
let used_syntax = self.used_syntax.entry(key).or_insert(0);
*used_syntax = used_syntax.saturating_add(value);
}
// attributes_to_search_on
self.attributes_to_search_on_total_number_of_uses = self
.attributes_to_search_on_total_number_of_uses
.saturating_add(attributes_to_search_on_total_number_of_uses);
// q
self.max_terms_number = self.max_terms_number.max(max_terms_number);
// vector
self.max_vector_size = self.max_vector_size.max(max_vector_size);
self.retrieve_vectors |= retrieve_vectors;
self.semantic_ratio |= semantic_ratio;
self.hybrid |= hybrid;
// pagination
self.max_limit = self.max_limit.max(max_limit);
self.max_offset = self.max_offset.max(max_offset);
self.finite_pagination += finite_pagination;
// formatting
self.max_attributes_to_retrieve =
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve);
self.max_attributes_to_highlight =
self.max_attributes_to_highlight.max(max_attributes_to_highlight);
self.highlight_pre_tag |= highlight_pre_tag;
self.highlight_post_tag |= highlight_post_tag;
self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop);
self.crop_marker |= crop_marker;
self.show_matches_position |= show_matches_position;
self.crop_length |= crop_length;
// facets
self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms);
self.facets_total_number_of_facets =
self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets);
// matching strategy
for (key, value) in matching_strategy.into_iter() {
let matching_strategy = self.matching_strategy.entry(key).or_insert(0);
*matching_strategy = matching_strategy.saturating_add(value);
}
// scoring
self.show_ranking_score |= show_ranking_score;
self.show_ranking_score_details |= show_ranking_score_details;
self.ranking_score_threshold |= ranking_score_threshold;
// locales
self.locales.append(&mut locales);
self
}
fn into_event(self: Box<Self>) -> serde_json::Value {
let Self {
total_received,
total_succeeded,
time_spent,
sort_with_geo_point,
sort_sum_of_criteria_terms,
sort_total_number_of_criteria,
distinct,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_sum_of_criteria_terms,
filter_total_number_of_criteria,
used_syntax,
attributes_to_search_on_total_number_of_uses,
max_terms_number,
max_vector_size,
retrieve_vectors,
matching_strategy,
max_limit,
max_offset,
finite_pagination,
max_attributes_to_retrieve,
max_attributes_to_highlight,
highlight_pre_tag,
highlight_post_tag,
max_attributes_to_crop,
crop_marker,
show_matches_position,
crop_length,
facets_sum_of_terms,
facets_total_number_of_facets,
show_ranking_score,
show_ranking_score_details,
semantic_ratio,
hybrid,
total_degraded,
total_used_negative_operator,
ranking_score_threshold,
locales,
marker: _,
} = *self;
// we get all the values in sorted order
let time_spent = time_spent.into_sorted_vec();
// the index of the 99th percentile value
let percentile_99th = time_spent.len() * 99 / 100;
// we are only interested in the slowest value among the 99% fastest results
let time_spent = time_spent.get(percentile_99th);
json!({
"requests": {
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
"total_succeeded": total_succeeded,
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
"total_received": total_received,
"total_degraded": total_degraded,
"total_used_negative_operator": total_used_negative_operator,
},
"sort": {
"with_geoPoint": sort_with_geo_point,
"avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64),
},
"distinct": distinct,
"filter": {
"with_geoRadius": filter_with_geo_radius,
"with_geoBoundingBox": filter_with_geo_bounding_box,
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
},
"attributes_to_search_on": {
"total_number_of_uses": attributes_to_search_on_total_number_of_uses,
},
"q": {
"max_terms_number": max_terms_number,
},
"vector": {
"max_vector_size": max_vector_size,
"retrieve_vectors": retrieve_vectors,
},
"hybrid": {
"enabled": hybrid,
"semantic_ratio": semantic_ratio,
},
"pagination": {
"max_limit": max_limit,
"max_offset": max_offset,
"most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" },
},
"formatting": {
"max_attributes_to_retrieve": max_attributes_to_retrieve,
"max_attributes_to_highlight": max_attributes_to_highlight,
"highlight_pre_tag": highlight_pre_tag,
"highlight_post_tag": highlight_post_tag,
"max_attributes_to_crop": max_attributes_to_crop,
"crop_marker": crop_marker,
"show_matches_position": show_matches_position,
"crop_length": crop_length,
},
"facets": {
"avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64),
},
"matching_strategy": {
"most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
},
"locales": locales,
"scoring": {
"show_ranking_score": show_ranking_score,
"show_ranking_score_details": show_ranking_score_details,
"ranking_score_threshold": ranking_score_threshold,
},
})
}
}
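Two of the derived metrics in `into_event` above are easy to misread, so here is a small numeric illustration under assumed counts (200 collected timings, 150 finite-pagination requests out of 200 received; these numbers are not from the diff):

fn main() {
    // 99th-percentile index: integer arithmetic on the sorted timings.
    let time_spent: Vec<usize> = (1..=200).collect(); // already sorted
    let percentile_99th = time_spent.len() * 99 / 100; // 198
    assert_eq!(time_spent[percentile_99th], 199);

    // most_used_navigation: "exhaustive" once more than half of the received
    // requests used finite pagination.
    let (finite_pagination, total_received) = (150usize, 200usize);
    let most_used_navigation =
        if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" };
    assert_eq!(most_used_navigation, "exhaustive");
}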

View File

@ -4,13 +4,14 @@ use deserr::actix_web::AwebJson;
use index_scheduler::IndexScheduler; use index_scheduler::IndexScheduler;
use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::ResponseError; use meilisearch_types::error::ResponseError;
use meilisearch_types::facet_values_sort::FacetValuesSort;
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::update::Setting;
use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked}; use meilisearch_types::settings::{settings, RankingRuleView, SecretPolicy, Settings, Unchecked};
use meilisearch_types::tasks::KindWithContent; use meilisearch_types::tasks::KindWithContent;
use serde_json::json;
use tracing::debug; use tracing::debug;
use super::settings_analytics::*;
use crate::analytics::Analytics; use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
@ -19,7 +20,7 @@ use crate::Opt;
#[macro_export] #[macro_export]
macro_rules! make_setting_route { macro_rules! make_setting_route {
($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics_var:ident, $analytics:expr) => {
pub mod $attr { pub mod $attr {
use actix_web::web::Data; use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse, Resource}; use actix_web::{web, HttpRequest, HttpResponse, Resource};
@ -79,7 +80,7 @@ macro_rules! make_setting_route {
body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>, body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, $analytics_var: web::Data<dyn Analytics>,
) -> std::result::Result<HttpResponse, ResponseError> { ) -> std::result::Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -87,10 +88,7 @@ macro_rules! make_setting_route {
debug!(parameters = ?body, "Update settings"); debug!(parameters = ?body, "Update settings");
#[allow(clippy::redundant_closure_call)] #[allow(clippy::redundant_closure_call)]
analytics.publish( $analytics(&body, &req);
$crate::routes::indexes::settings_analytics::$analytics::new(body.as_ref()).into_settings(),
&req,
);
let new_settings = Settings { let new_settings = Settings {
$attr: match body { $attr: match body {
@ -162,7 +160,21 @@ make_setting_route!(
>, >,
filterable_attributes, filterable_attributes,
"filterableAttributes", "filterableAttributes",
FilterableAttributesAnalytics analytics,
|setting: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"FilterableAttributes Updated".to_string(),
json!({
"filterable_attributes": {
"total": setting.as_ref().map(|filter| filter.len()).unwrap_or(0),
"has_geo": setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false),
}
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -174,7 +186,21 @@ make_setting_route!(
>, >,
sortable_attributes, sortable_attributes,
"sortableAttributes", "sortableAttributes",
SortableAttributesAnalytics analytics,
|setting: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"SortableAttributes Updated".to_string(),
json!({
"sortable_attributes": {
"total": setting.as_ref().map(|sort| sort.len()),
"has_geo": setting.as_ref().map(|sort| sort.contains("_geo")),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -186,7 +212,21 @@ make_setting_route!(
>, >,
displayed_attributes, displayed_attributes,
"displayedAttributes", "displayedAttributes",
DisplayedAttributesAnalytics analytics,
|displayed: &Option<Vec<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"DisplayedAttributes Updated".to_string(),
json!({
"displayed_attributes": {
"total": displayed.as_ref().map(|displayed| displayed.len()),
"with_wildcard": displayed.as_ref().map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -198,7 +238,40 @@ make_setting_route!(
>, >,
typo_tolerance, typo_tolerance,
"typoTolerance", "typoTolerance",
TypoToleranceAnalytics analytics,
|setting: &Option<meilisearch_types::settings::TypoSettings>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"TypoTolerance Updated".to_string(),
json!({
"typo_tolerance": {
"enabled": setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))),
"disable_on_attributes": setting
.as_ref()
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
"disable_on_words": setting
.as_ref()
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
"min_word_size_for_one_typo": setting
.as_ref()
.and_then(|s| s.min_word_size_for_typos
.as_ref()
.set()
.map(|s| s.one_typo.set()))
.flatten(),
"min_word_size_for_two_typos": setting
.as_ref()
.and_then(|s| s.min_word_size_for_typos
.as_ref()
.set()
.map(|s| s.two_typos.set()))
.flatten(),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -210,7 +283,21 @@ make_setting_route!(
>, >,
searchable_attributes, searchable_attributes,
"searchableAttributes", "searchableAttributes",
SearchableAttributesAnalytics analytics,
|setting: &Option<Vec<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"SearchableAttributes Updated".to_string(),
json!({
"searchable_attributes": {
"total": setting.as_ref().map(|searchable| searchable.len()),
"with_wildcard": setting.as_ref().map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -222,7 +309,20 @@ make_setting_route!(
>, >,
stop_words, stop_words,
"stopWords", "stopWords",
StopWordsAnalytics analytics,
|stop_words: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"StopWords Updated".to_string(),
json!({
"stop_words": {
"total": stop_words.as_ref().map(|stop_words| stop_words.len()),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -234,7 +334,20 @@ make_setting_route!(
>, >,
non_separator_tokens, non_separator_tokens,
"nonSeparatorTokens", "nonSeparatorTokens",
NonSeparatorTokensAnalytics analytics,
|non_separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"nonSeparatorTokens Updated".to_string(),
json!({
"non_separator_tokens": {
"total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -246,7 +359,20 @@ make_setting_route!(
>, >,
separator_tokens, separator_tokens,
"separatorTokens", "separatorTokens",
SeparatorTokensAnalytics analytics,
|separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"separatorTokens Updated".to_string(),
json!({
"separator_tokens": {
"total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -258,7 +384,20 @@ make_setting_route!(
>, >,
dictionary, dictionary,
"dictionary", "dictionary",
DictionaryAnalytics analytics,
|dictionary: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"dictionary Updated".to_string(),
json!({
"dictionary": {
"total": dictionary.as_ref().map(|dictionary| dictionary.len()),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -270,7 +409,20 @@ make_setting_route!(
>, >,
synonyms, synonyms,
"synonyms", "synonyms",
SynonymsAnalytics analytics,
|synonyms: &Option<std::collections::BTreeMap<String, Vec<String>>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"Synonyms Updated".to_string(),
json!({
"synonyms": {
"total": synonyms.as_ref().map(|synonyms| synonyms.len()),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -282,7 +434,19 @@ make_setting_route!(
>, >,
distinct_attribute, distinct_attribute,
"distinctAttribute", "distinctAttribute",
DistinctAttributeAnalytics analytics,
|distinct: &Option<String>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"DistinctAttribute Updated".to_string(),
json!({
"distinct_attribute": {
"set": distinct.is_some(),
}
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -294,7 +458,20 @@ make_setting_route!(
>, >,
proximity_precision, proximity_precision,
"proximityPrecision", "proximityPrecision",
ProximityPrecisionAnalytics analytics,
|precision: &Option<meilisearch_types::settings::ProximityPrecisionView>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"ProximityPrecision Updated".to_string(),
json!({
"proximity_precision": {
"set": precision.is_some(),
"value": precision.unwrap_or_default(),
}
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -306,7 +483,17 @@ make_setting_route!(
>, >,
localized_attributes, localized_attributes,
"localizedAttributes", "localizedAttributes",
LocalesAnalytics analytics,
|rules: &Option<Vec<meilisearch_types::locales::LocalizedAttributesRuleView>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"LocalizedAttributesRules Updated".to_string(),
json!({
"locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>())
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -318,7 +505,26 @@ make_setting_route!(
>, >,
ranking_rules, ranking_rules,
"rankingRules", "rankingRules",
RankingRulesAnalytics analytics,
|setting: &Option<Vec<meilisearch_types::settings::RankingRuleView>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"RankingRules Updated".to_string(),
json!({
"ranking_rules": {
"words_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))),
"typo_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))),
"proximity_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Proximity))),
"attribute_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Attribute))),
"sort_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))),
"exactness_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Exactness))),
"values": setting.as_ref().map(|rr| rr.iter().filter(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Asc(_) | meilisearch_types::settings::RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::<Vec<_>>().join(", ")),
}
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -330,7 +536,25 @@ make_setting_route!(
>, >,
faceting, faceting,
"faceting", "faceting",
FacetingAnalytics analytics,
|setting: &Option<meilisearch_types::settings::FacetingSettings>, req: &HttpRequest| {
use serde_json::json;
use meilisearch_types::facet_values_sort::FacetValuesSort;
analytics.publish(
"Faceting Updated".to_string(),
json!({
"faceting": {
"max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()),
"sort_facet_values_by_star_count": setting.as_ref().and_then(|s| {
s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
}),
"sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -342,7 +566,20 @@ make_setting_route!(
>, >,
pagination, pagination,
"pagination", "pagination",
PaginationAnalytics analytics,
|setting: &Option<meilisearch_types::settings::PaginationSettings>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"Pagination Updated".to_string(),
json!({
"pagination": {
"max_total_hits": setting.as_ref().and_then(|s| s.max_total_hits.set()),
},
}),
Some(req),
);
}
); );
make_setting_route!( make_setting_route!(
@ -354,8 +591,74 @@ make_setting_route!(
>, >,
embedders, embedders,
"embedders", "embedders",
EmbeddersAnalytics analytics,
|setting: &Option<std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>>, req: &HttpRequest| {
analytics.publish(
"Embedders Updated".to_string(),
serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}),
Some(req),
); );
}
);
fn embedder_analytics(
setting: Option<
&std::collections::BTreeMap<
String,
Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>,
>,
>,
) -> serde_json::Value {
let mut sources = std::collections::HashSet::new();
if let Some(s) = &setting {
for source in s
.values()
.filter_map(|config| config.clone().set())
.filter_map(|config| config.source.set())
{
use meilisearch_types::milli::vector::settings::EmbedderSource;
match source {
EmbedderSource::OpenAi => sources.insert("openAi"),
EmbedderSource::HuggingFace => sources.insert("huggingFace"),
EmbedderSource::UserProvided => sources.insert("userProvided"),
EmbedderSource::Ollama => sources.insert("ollama"),
EmbedderSource::Rest => sources.insert("rest"),
};
}
};
let document_template_used = setting.as_ref().map(|map| {
map.values()
.filter_map(|config| config.clone().set())
.any(|config| config.document_template.set().is_some())
});
let document_template_max_bytes = setting.as_ref().and_then(|map| {
map.values()
.filter_map(|config| config.clone().set())
.filter_map(|config| config.document_template_max_bytes.set())
.max()
});
let binary_quantization_used = setting.as_ref().map(|map| {
map.values()
.filter_map(|config| config.clone().set())
.any(|config| config.binary_quantized.set().is_some())
});
json!(
{
"total": setting.as_ref().map(|s| s.len()),
"sources": sources,
"document_template_used": document_template_used,
"document_template_max_bytes": document_template_max_bytes,
"binary_quantization_used": binary_quantization_used,
}
)
}
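For reference, the value returned by `embedder_analytics` above is a plain JSON object; a hypothetical payload for an index with one OpenAI and one user-provided embedder might look like the following (the concrete values are invented for illustration):

// Hypothetical example payload; field values are made up.
fn main() {
    let _example = serde_json::json!({
        "total": 2,
        "sources": ["openAi", "userProvided"],
        "document_template_used": true,
        "document_template_max_bytes": 400,
        "binary_quantization_used": false,
    });
}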
make_setting_route!( make_setting_route!(
"/search-cutoff-ms", "/search-cutoff-ms",
@ -366,7 +669,14 @@ make_setting_route!(
>, >,
search_cutoff_ms, search_cutoff_ms,
"searchCutoffMs", "searchCutoffMs",
SearchCutoffMsAnalytics analytics,
|setting: &Option<u64>, req: &HttpRequest| {
analytics.publish(
"Search Cutoff Updated".to_string(),
serde_json::json!({"search_cutoff_ms": setting }),
Some(req),
);
}
); );
macro_rules! generate_configure { macro_rules! generate_configure {
@ -410,7 +720,7 @@ pub async fn update_all(
body: AwebJson<Settings<Unchecked>, DeserrJsonError>, body: AwebJson<Settings<Unchecked>, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -419,45 +729,104 @@ pub async fn update_all(
let new_settings = validate_settings(new_settings, &index_scheduler)?; let new_settings = validate_settings(new_settings, &index_scheduler)?;
analytics.publish( analytics.publish(
SettingsAnalytics { "Settings Updated".to_string(),
ranking_rules: RankingRulesAnalytics::new(new_settings.ranking_rules.as_ref().set()), json!({
searchable_attributes: SearchableAttributesAnalytics::new( "ranking_rules": {
new_settings.searchable_attributes.as_ref().set(), "words_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Words))),
), "typo_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Typo))),
displayed_attributes: DisplayedAttributesAnalytics::new( "proximity_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Proximity))),
new_settings.displayed_attributes.as_ref().set(), "attribute_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Attribute))),
), "sort_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Sort))),
sortable_attributes: SortableAttributesAnalytics::new( "exactness_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Exactness))),
new_settings.sortable_attributes.as_ref().set(), "values": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().filter(|s| !matches!(s, RankingRuleView::Asc(_) | RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::<Vec<_>>().join(", ")),
),
filterable_attributes: FilterableAttributesAnalytics::new(
new_settings.filterable_attributes.as_ref().set(),
),
distinct_attribute: DistinctAttributeAnalytics::new(
new_settings.distinct_attribute.as_ref().set(),
),
proximity_precision: ProximityPrecisionAnalytics::new(
new_settings.proximity_precision.as_ref().set(),
),
typo_tolerance: TypoToleranceAnalytics::new(new_settings.typo_tolerance.as_ref().set()),
faceting: FacetingAnalytics::new(new_settings.faceting.as_ref().set()),
pagination: PaginationAnalytics::new(new_settings.pagination.as_ref().set()),
stop_words: StopWordsAnalytics::new(new_settings.stop_words.as_ref().set()),
synonyms: SynonymsAnalytics::new(new_settings.synonyms.as_ref().set()),
embedders: EmbeddersAnalytics::new(new_settings.embedders.as_ref().set()),
search_cutoff_ms: SearchCutoffMsAnalytics::new(
new_settings.search_cutoff_ms.as_ref().set(),
),
locales: LocalesAnalytics::new(new_settings.localized_attributes.as_ref().set()),
dictionary: DictionaryAnalytics::new(new_settings.dictionary.as_ref().set()),
separator_tokens: SeparatorTokensAnalytics::new(
new_settings.separator_tokens.as_ref().set(),
),
non_separator_tokens: NonSeparatorTokensAnalytics::new(
new_settings.non_separator_tokens.as_ref().set(),
),
}, },
&req, "searchable_attributes": {
"total": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.len()),
"with_wildcard": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
},
"displayed_attributes": {
"total": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.len()),
"with_wildcard": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
},
"sortable_attributes": {
"total": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.len()),
"has_geo": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.iter().any(|s| s == "_geo")),
},
"filterable_attributes": {
"total": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.len()),
"has_geo": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.iter().any(|s| s == "_geo")),
},
"distinct_attribute": {
"set": new_settings.distinct_attribute.as_ref().set().is_some()
},
"proximity_precision": {
"set": new_settings.proximity_precision.as_ref().set().is_some(),
"value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
},
"typo_tolerance": {
"enabled": new_settings.typo_tolerance
.as_ref()
.set()
.and_then(|s| s.enabled.as_ref().set())
.copied(),
"disable_on_attributes": new_settings.typo_tolerance
.as_ref()
.set()
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
"disable_on_words": new_settings.typo_tolerance
.as_ref()
.set()
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
"min_word_size_for_one_typo": new_settings.typo_tolerance
.as_ref()
.set()
.and_then(|s| s.min_word_size_for_typos
.as_ref()
.set()
.map(|s| s.one_typo.set()))
.flatten(),
"min_word_size_for_two_typos": new_settings.typo_tolerance
.as_ref()
.set()
.and_then(|s| s.min_word_size_for_typos
.as_ref()
.set()
.map(|s| s.two_typos.set()))
.flatten(),
},
"faceting": {
"max_values_per_facet": new_settings.faceting
.as_ref()
.set()
.and_then(|s| s.max_values_per_facet.as_ref().set()),
"sort_facet_values_by_star_count": new_settings.faceting
.as_ref()
.set()
.and_then(|s| {
s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
}),
"sort_facet_values_by_total": new_settings.faceting
.as_ref()
.set()
.and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
},
"pagination": {
"max_total_hits": new_settings.pagination
.as_ref()
.set()
.and_then(|s| s.max_total_hits.as_ref().set()),
},
"stop_words": {
"total": new_settings.stop_words.as_ref().set().map(|stop_words| stop_words.len()),
},
"synonyms": {
"total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()),
},
"embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()),
"search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(),
"locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>()),
}),
Some(&req),
); );
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
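Both sides of this hunk read every sub-setting through `as_ref().set()`, so a metric is only reported when the corresponding field was explicitly present in the settings payload. Below is a minimal sketch of the three-state wrapper this pattern assumes; the real `Setting` type comes from `meilisearch_types::milli::update` and may differ in detail.

// Hedged sketch only: mirrors the behaviour relied on above, not the actual definition.
pub enum Setting<T> {
    Set(T),  // the field was sent with a value
    Reset,   // the field was sent as `null` to clear it
    NotSet,  // the field was absent from the payload
}

impl<T> Setting<T> {
    // Yields the value only when it was explicitly provided, which is why the
    // analytics code chains `.as_ref().set()` before every `map`/`and_then`.
    pub fn set(self) -> Option<T> {
        match self {
            Setting::Set(value) => Some(value),
            Setting::Reset | Setting::NotSet => None,
        }
    }

    pub fn as_ref(&self) -> Setting<&T> {
        match *self {
            Setting::Set(ref value) => Setting::Set(value),
            Setting::Reset => Setting::Reset,
            Setting::NotSet => Setting::NotSet,
        }
    }
}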

View File

@ -1,622 +0,0 @@
//! All the structures used to make the analytics on the settings work.
//! The signatures of the `new` functions are not very Rust-idiomatic because they must match the types received
//! through the sub-settings route directly without any manipulation.
//! This is why we often use an `Option<&Vec<_>>` instead of an `Option<&[_]>`.
use std::collections::{BTreeMap, BTreeSet, HashSet};
use meilisearch_types::facet_values_sort::FacetValuesSort;
use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView};
use meilisearch_types::milli::update::Setting;
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
use meilisearch_types::settings::{
FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings,
};
use serde::Serialize;
use crate::analytics::Aggregate;
#[derive(Serialize, Default)]
pub struct SettingsAnalytics {
pub ranking_rules: RankingRulesAnalytics,
pub searchable_attributes: SearchableAttributesAnalytics,
pub displayed_attributes: DisplayedAttributesAnalytics,
pub sortable_attributes: SortableAttributesAnalytics,
pub filterable_attributes: FilterableAttributesAnalytics,
pub distinct_attribute: DistinctAttributeAnalytics,
pub proximity_precision: ProximityPrecisionAnalytics,
pub typo_tolerance: TypoToleranceAnalytics,
pub faceting: FacetingAnalytics,
pub pagination: PaginationAnalytics,
pub stop_words: StopWordsAnalytics,
pub synonyms: SynonymsAnalytics,
pub embedders: EmbeddersAnalytics,
pub search_cutoff_ms: SearchCutoffMsAnalytics,
pub locales: LocalesAnalytics,
pub dictionary: DictionaryAnalytics,
pub separator_tokens: SeparatorTokensAnalytics,
pub non_separator_tokens: NonSeparatorTokensAnalytics,
}
impl Aggregate for SettingsAnalytics {
fn event_name(&self) -> &'static str {
"Settings Updated"
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self {
ranking_rules: RankingRulesAnalytics {
words_position: new
.ranking_rules
.words_position
.or(self.ranking_rules.words_position),
typo_position: new.ranking_rules.typo_position.or(self.ranking_rules.typo_position),
proximity_position: new
.ranking_rules
.proximity_position
.or(self.ranking_rules.proximity_position),
attribute_position: new
.ranking_rules
.attribute_position
.or(self.ranking_rules.attribute_position),
sort_position: new.ranking_rules.sort_position.or(self.ranking_rules.sort_position),
exactness_position: new
.ranking_rules
.exactness_position
.or(self.ranking_rules.exactness_position),
values: new.ranking_rules.values.or(self.ranking_rules.values),
},
searchable_attributes: SearchableAttributesAnalytics {
total: new.searchable_attributes.total.or(self.searchable_attributes.total),
with_wildcard: new
.searchable_attributes
.with_wildcard
.or(self.searchable_attributes.with_wildcard),
},
displayed_attributes: DisplayedAttributesAnalytics {
total: new.displayed_attributes.total.or(self.displayed_attributes.total),
with_wildcard: new
.displayed_attributes
.with_wildcard
.or(self.displayed_attributes.with_wildcard),
},
sortable_attributes: SortableAttributesAnalytics {
total: new.sortable_attributes.total.or(self.sortable_attributes.total),
has_geo: new.sortable_attributes.has_geo.or(self.sortable_attributes.has_geo),
},
filterable_attributes: FilterableAttributesAnalytics {
total: new.filterable_attributes.total.or(self.filterable_attributes.total),
has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo),
},
distinct_attribute: DistinctAttributeAnalytics {
set: self.distinct_attribute.set | new.distinct_attribute.set,
},
proximity_precision: ProximityPrecisionAnalytics {
set: self.proximity_precision.set | new.proximity_precision.set,
value: new.proximity_precision.value.or(self.proximity_precision.value),
},
typo_tolerance: TypoToleranceAnalytics {
enabled: new.typo_tolerance.enabled.or(self.typo_tolerance.enabled),
disable_on_attributes: new
.typo_tolerance
.disable_on_attributes
.or(self.typo_tolerance.disable_on_attributes),
disable_on_words: new
.typo_tolerance
.disable_on_words
.or(self.typo_tolerance.disable_on_words),
min_word_size_for_one_typo: new
.typo_tolerance
.min_word_size_for_one_typo
.or(self.typo_tolerance.min_word_size_for_one_typo),
min_word_size_for_two_typos: new
.typo_tolerance
.min_word_size_for_two_typos
.or(self.typo_tolerance.min_word_size_for_two_typos),
},
faceting: FacetingAnalytics {
max_values_per_facet: new
.faceting
.max_values_per_facet
.or(self.faceting.max_values_per_facet),
sort_facet_values_by_star_count: new
.faceting
.sort_facet_values_by_star_count
.or(self.faceting.sort_facet_values_by_star_count),
sort_facet_values_by_total: new
.faceting
.sort_facet_values_by_total
.or(self.faceting.sort_facet_values_by_total),
},
pagination: PaginationAnalytics {
max_total_hits: new.pagination.max_total_hits.or(self.pagination.max_total_hits),
},
stop_words: StopWordsAnalytics {
total: new.stop_words.total.or(self.stop_words.total),
},
synonyms: SynonymsAnalytics { total: new.synonyms.total.or(self.synonyms.total) },
embedders: EmbeddersAnalytics {
total: new.embedders.total.or(self.embedders.total),
sources: match (self.embedders.sources, new.embedders.sources) {
(None, None) => None,
(Some(sources), None) | (None, Some(sources)) => Some(sources),
(Some(this), Some(other)) => Some(this.union(&other).cloned().collect()),
},
document_template_used: match (
self.embedders.document_template_used,
new.embedders.document_template_used,
) {
(None, None) => None,
(Some(used), None) | (None, Some(used)) => Some(used),
(Some(this), Some(other)) => Some(this | other),
},
document_template_max_bytes: match (
self.embedders.document_template_max_bytes,
new.embedders.document_template_max_bytes,
) {
(None, None) => None,
(Some(bytes), None) | (None, Some(bytes)) => Some(bytes),
(Some(this), Some(other)) => Some(this.max(other)),
},
binary_quantization_used: match (
self.embedders.binary_quantization_used,
new.embedders.binary_quantization_used,
) {
(None, None) => None,
(Some(bq), None) | (None, Some(bq)) => Some(bq),
(Some(this), Some(other)) => Some(this | other),
},
},
search_cutoff_ms: SearchCutoffMsAnalytics {
search_cutoff_ms: new
.search_cutoff_ms
.search_cutoff_ms
.or(self.search_cutoff_ms.search_cutoff_ms),
},
locales: LocalesAnalytics { locales: new.locales.locales.or(self.locales.locales) },
dictionary: DictionaryAnalytics {
total: new.dictionary.total.or(self.dictionary.total),
},
separator_tokens: SeparatorTokensAnalytics {
total: new.separator_tokens.total.or(self.separator_tokens.total),
},
non_separator_tokens: NonSeparatorTokensAnalytics {
total: new.non_separator_tokens.total.or(self.non_separator_tokens.total),
},
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(*self).unwrap_or_default()
}
}
#[derive(Serialize, Default)]
pub struct RankingRulesAnalytics {
pub words_position: Option<usize>,
pub typo_position: Option<usize>,
pub proximity_position: Option<usize>,
pub attribute_position: Option<usize>,
pub sort_position: Option<usize>,
pub exactness_position: Option<usize>,
pub values: Option<String>,
}
impl RankingRulesAnalytics {
pub fn new(rr: Option<&Vec<RankingRuleView>>) -> Self {
RankingRulesAnalytics {
words_position: rr.as_ref().and_then(|rr| {
rr.iter()
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))
}),
typo_position: rr.as_ref().and_then(|rr| {
rr.iter()
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))
}),
proximity_position: rr.as_ref().and_then(|rr| {
rr.iter().position(|s| {
matches!(s, meilisearch_types::settings::RankingRuleView::Proximity)
})
}),
attribute_position: rr.as_ref().and_then(|rr| {
rr.iter().position(|s| {
matches!(s, meilisearch_types::settings::RankingRuleView::Attribute)
})
}),
sort_position: rr.as_ref().and_then(|rr| {
rr.iter()
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))
}),
exactness_position: rr.as_ref().and_then(|rr| {
rr.iter().position(|s| {
matches!(s, meilisearch_types::settings::RankingRuleView::Exactness)
})
}),
values: rr.as_ref().map(|rr| {
rr.iter()
.filter(|s| {
!matches!(
s,
meilisearch_types::settings::RankingRuleView::Asc(_)
| meilisearch_types::settings::RankingRuleView::Desc(_)
)
})
.map(|x| x.to_string())
.collect::<Vec<_>>()
.join(", ")
}),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { ranking_rules: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct SearchableAttributesAnalytics {
pub total: Option<usize>,
pub with_wildcard: Option<bool>,
}
impl SearchableAttributesAnalytics {
pub fn new(setting: Option<&Vec<String>>) -> Self {
Self {
total: setting.as_ref().map(|searchable| searchable.len()),
with_wildcard: setting
.as_ref()
.map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { searchable_attributes: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct DisplayedAttributesAnalytics {
pub total: Option<usize>,
pub with_wildcard: Option<bool>,
}
impl DisplayedAttributesAnalytics {
pub fn new(displayed: Option<&Vec<String>>) -> Self {
Self {
total: displayed.as_ref().map(|displayed| displayed.len()),
with_wildcard: displayed
.as_ref()
.map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { displayed_attributes: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct SortableAttributesAnalytics {
pub total: Option<usize>,
pub has_geo: Option<bool>,
}
impl SortableAttributesAnalytics {
pub fn new(setting: Option<&BTreeSet<String>>) -> Self {
Self {
total: setting.as_ref().map(|sort| sort.len()),
has_geo: setting.as_ref().map(|sort| sort.contains("_geo")),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { sortable_attributes: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct FilterableAttributesAnalytics {
pub total: Option<usize>,
pub has_geo: Option<bool>,
}
impl FilterableAttributesAnalytics {
pub fn new(setting: Option<&BTreeSet<String>>) -> Self {
Self {
total: setting.as_ref().map(|filter| filter.len()),
has_geo: setting.as_ref().map(|filter| filter.contains("_geo")),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { filterable_attributes: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct DistinctAttributeAnalytics {
pub set: bool,
}
impl DistinctAttributeAnalytics {
pub fn new(distinct: Option<&String>) -> Self {
Self { set: distinct.is_some() }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { distinct_attribute: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct ProximityPrecisionAnalytics {
pub set: bool,
pub value: Option<ProximityPrecisionView>,
}
impl ProximityPrecisionAnalytics {
pub fn new(precision: Option<&ProximityPrecisionView>) -> Self {
Self { set: precision.is_some(), value: precision.cloned() }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { proximity_precision: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct TypoToleranceAnalytics {
pub enabled: Option<bool>,
pub disable_on_attributes: Option<bool>,
pub disable_on_words: Option<bool>,
pub min_word_size_for_one_typo: Option<u8>,
pub min_word_size_for_two_typos: Option<u8>,
}
impl TypoToleranceAnalytics {
pub fn new(setting: Option<&TypoSettings>) -> Self {
Self {
enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))),
disable_on_attributes: setting
.as_ref()
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
disable_on_words: setting
.as_ref()
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
min_word_size_for_one_typo: setting
.as_ref()
.and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set()))
.flatten(),
min_word_size_for_two_typos: setting
.as_ref()
.and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set()))
.flatten(),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { typo_tolerance: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct FacetingAnalytics {
pub max_values_per_facet: Option<usize>,
pub sort_facet_values_by_star_count: Option<bool>,
pub sort_facet_values_by_total: Option<usize>,
}
impl FacetingAnalytics {
pub fn new(setting: Option<&FacetingSettings>) -> Self {
Self {
max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()),
sort_facet_values_by_star_count: setting.as_ref().and_then(|s| {
s.sort_facet_values_by
.as_ref()
.set()
.map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
}),
sort_facet_values_by_total: setting
.as_ref()
.and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { faceting: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct PaginationAnalytics {
pub max_total_hits: Option<usize>,
}
impl PaginationAnalytics {
pub fn new(setting: Option<&PaginationSettings>) -> Self {
Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { pagination: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct StopWordsAnalytics {
pub total: Option<usize>,
}
impl StopWordsAnalytics {
pub fn new(stop_words: Option<&BTreeSet<String>>) -> Self {
Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { stop_words: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct SynonymsAnalytics {
pub total: Option<usize>,
}
impl SynonymsAnalytics {
pub fn new(synonyms: Option<&BTreeMap<String, Vec<String>>>) -> Self {
Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { synonyms: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct EmbeddersAnalytics {
// merged by keeping the most recent non-missing value
pub total: Option<usize>,
// merged as the union of the sources
pub sources: Option<HashSet<String>>,
// merged with a logical OR (`|=`)
pub document_template_used: Option<bool>,
// merged by keeping the maximum
pub document_template_max_bytes: Option<usize>,
// merged with a logical OR (`|=`)
pub binary_quantization_used: Option<bool>,
}
impl EmbeddersAnalytics {
pub fn new(setting: Option<&BTreeMap<String, Setting<EmbeddingSettings>>>) -> Self {
let mut sources = std::collections::HashSet::new();
if let Some(s) = &setting {
for source in s
.values()
.filter_map(|config| config.clone().set())
.filter_map(|config| config.source.set())
{
use meilisearch_types::milli::vector::settings::EmbedderSource;
match source {
EmbedderSource::OpenAi => sources.insert("openAi".to_string()),
EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()),
EmbedderSource::UserProvided => sources.insert("userProvided".to_string()),
EmbedderSource::Ollama => sources.insert("ollama".to_string()),
EmbedderSource::Rest => sources.insert("rest".to_string()),
};
}
};
Self {
total: setting.as_ref().map(|s| s.len()),
sources: Some(sources),
document_template_used: setting.as_ref().map(|map| {
map.values()
.filter_map(|config| config.clone().set())
.any(|config| config.document_template.set().is_some())
}),
document_template_max_bytes: setting.as_ref().and_then(|map| {
map.values()
.filter_map(|config| config.clone().set())
.filter_map(|config| config.document_template_max_bytes.set())
.max()
}),
binary_quantization_used: setting.as_ref().map(|map| {
map.values()
.filter_map(|config| config.clone().set())
.any(|config| config.binary_quantized.set().is_some())
}),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { embedders: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
#[serde(transparent)]
pub struct SearchCutoffMsAnalytics {
pub search_cutoff_ms: Option<u64>,
}
impl SearchCutoffMsAnalytics {
pub fn new(setting: Option<&u64>) -> Self {
Self { search_cutoff_ms: setting.copied() }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { search_cutoff_ms: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
#[serde(transparent)]
pub struct LocalesAnalytics {
pub locales: Option<BTreeSet<Locale>>,
}
impl LocalesAnalytics {
pub fn new(rules: Option<&Vec<LocalizedAttributesRuleView>>) -> Self {
LocalesAnalytics {
locales: rules.as_ref().map(|rules| {
rules
.iter()
.flat_map(|rule| rule.locales.iter().cloned())
.collect::<std::collections::BTreeSet<_>>()
}),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { locales: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct DictionaryAnalytics {
pub total: Option<usize>,
}
impl DictionaryAnalytics {
pub fn new(dictionary: Option<&BTreeSet<String>>) -> Self {
Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { dictionary: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct SeparatorTokensAnalytics {
pub total: Option<usize>,
}
impl SeparatorTokensAnalytics {
pub fn new(separator_tokens: Option<&BTreeSet<String>>) -> Self {
Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) }
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { separator_tokens: self, ..Default::default() }
}
}
#[derive(Serialize, Default)]
pub struct NonSeparatorTokensAnalytics {
pub total: Option<usize>,
}
impl NonSeparatorTokensAnalytics {
pub fn new(non_separator_tokens: Option<&BTreeSet<String>>) -> Self {
Self {
total: non_separator_tokens
.as_ref()
.map(|non_separator_tokens| non_separator_tokens.len()),
}
}
pub fn into_settings(self) -> SettingsAnalytics {
SettingsAnalytics { non_separator_tokens: self, ..Default::default() }
}
}
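The file above hangs all settings telemetry on one pattern: each sub-analytics struct exposes `new` plus `into_settings`, and the enclosing `SettingsAnalytics` implements `Aggregate` so events from a whole batch can be merged field by field with `or`. A hedged usage sketch of how a sub-settings route could report only its own slice follows; the helper name and the exact `publish` signature are assumptions inferred from this diff.

// Hypothetical helper, for illustration only.
use std::collections::BTreeSet;

use actix_web::HttpRequest;

fn report_stop_words_update(
    analytics: &crate::analytics::Analytics, // publisher type as used in the handlers above
    stop_words: Option<&BTreeSet<String>>,
    req: &HttpRequest,
) {
    // Only `stop_words` is populated; every other field keeps its Default
    // (all-None) value, so the field-wise `or` in `aggregate` never overwrites
    // data coming from other requests in the same batch.
    let event = StopWordsAnalytics::new(stop_words).into_settings();
    analytics.publish(event, req);
}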

View File

@ -13,10 +13,9 @@ use serde_json::Value;
use tracing::debug; use tracing::debug;
use super::ActionPolicy; use super::ActionPolicy;
use crate::analytics::Analytics; use crate::analytics::{Analytics, SimilarAggregator};
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST};
use crate::search::{ use crate::search::{
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
@ -35,13 +34,13 @@ pub async fn similar_get(
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebQueryParameter<SimilarQueryGet, DeserrQueryParamError>, params: AwebQueryParameter<SimilarQueryGet, DeserrQueryParamError>,
req: HttpRequest, req: HttpRequest,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let query = params.0.try_into()?; let query = params.0.try_into()?;
let mut aggregate = SimilarAggregator::<SimilarGET>::from_query(&query); let mut aggregate = SimilarAggregator::from_query(&query, &req);
debug!(parameters = ?query, "Similar get"); debug!(parameters = ?query, "Similar get");
@ -50,7 +49,7 @@ pub async fn similar_get(
if let Ok(similar) = &similar { if let Ok(similar) = &similar {
aggregate.succeed(similar); aggregate.succeed(similar);
} }
analytics.publish(aggregate, &req); analytics.get_similar(aggregate);
let similar = similar?; let similar = similar?;
@ -63,21 +62,21 @@ pub async fn similar_post(
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebJson<SimilarQuery, DeserrJsonError>, params: AwebJson<SimilarQuery, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let query = params.into_inner(); let query = params.into_inner();
debug!(parameters = ?query, "Similar post"); debug!(parameters = ?query, "Similar post");
let mut aggregate = SimilarAggregator::<SimilarPOST>::from_query(&query); let mut aggregate = SimilarAggregator::from_query(&query, &req);
let similar = similar(index_scheduler, index_uid, query).await; let similar = similar(index_scheduler, index_uid, query).await;
if let Ok(similar) = &similar { if let Ok(similar) = &similar {
aggregate.succeed(similar); aggregate.succeed(similar);
} }
analytics.publish(aggregate, &req); analytics.post_similar(aggregate);
let similar = similar?; let similar = similar?;

View File

@ -1,233 +0,0 @@
use std::collections::{BinaryHeap, HashMap};
use once_cell::sync::Lazy;
use regex::Regex;
use serde_json::{json, Value};
use crate::aggregate_methods;
use crate::analytics::{Aggregate, AggregateMethod};
use crate::search::{SimilarQuery, SimilarResult};
aggregate_methods!(
SimilarPOST => "Similar POST",
SimilarGET => "Similar GET",
);
#[derive(Default)]
pub struct SimilarAggregator<Method: AggregateMethod> {
// requests
total_received: usize,
total_succeeded: usize,
time_spent: BinaryHeap<usize>,
// filter
filter_with_geo_radius: bool,
filter_with_geo_bounding_box: bool,
// every time a request has a filter, this field must be incremented by the number of terms it contains
filter_sum_of_criteria_terms: usize,
// every time a request has a filter, this field must be incremented by one
filter_total_number_of_criteria: usize,
used_syntax: HashMap<String, usize>,
// Whether the vectors were requested alongside the hits (`retrieveVectors`)
retrieve_vectors: bool,
// pagination
max_limit: usize,
max_offset: usize,
// formatting
max_attributes_to_retrieve: usize,
// scoring
show_ranking_score: bool,
show_ranking_score_details: bool,
ranking_score_threshold: bool,
marker: std::marker::PhantomData<Method>,
}
impl<Method: AggregateMethod> SimilarAggregator<Method> {
#[allow(clippy::field_reassign_with_default)]
pub fn from_query(query: &SimilarQuery) -> Self {
let SimilarQuery {
id: _,
embedder: _,
offset,
limit,
attributes_to_retrieve: _,
retrieve_vectors,
show_ranking_score,
show_ranking_score_details,
filter,
ranking_score_threshold,
} = query;
let mut ret = Self::default();
ret.total_received = 1;
if let Some(ref filter) = filter {
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
ret.filter_total_number_of_criteria = 1;
let syntax = match filter {
Value::String(_) => "string".to_string(),
Value::Array(values) => {
if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
"mixed".to_string()
} else {
"array".to_string()
}
}
_ => "none".to_string(),
};
// convert the string to a HashMap
ret.used_syntax.insert(syntax, 1);
let stringified_filters = filter.to_string();
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
}
ret.max_limit = *limit;
ret.max_offset = *offset;
ret.show_ranking_score = *show_ranking_score;
ret.show_ranking_score_details = *show_ranking_score_details;
ret.ranking_score_threshold = ranking_score_threshold.is_some();
ret.retrieve_vectors = *retrieve_vectors;
ret
}
pub fn succeed(&mut self, result: &SimilarResult) {
let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result;
self.total_succeeded = self.total_succeeded.saturating_add(1);
self.time_spent.push(*processing_time_ms as usize);
}
}
impl<Method: AggregateMethod> Aggregate for SimilarAggregator<Method> {
fn event_name(&self) -> &'static str {
Method::event_name()
}
/// Aggregate one [SimilarAggregator] into another.
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
let Self {
total_received,
total_succeeded,
mut time_spent,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_sum_of_criteria_terms,
filter_total_number_of_criteria,
used_syntax,
max_limit,
max_offset,
max_attributes_to_retrieve,
show_ranking_score,
show_ranking_score_details,
ranking_score_threshold,
retrieve_vectors,
marker: _,
} = *new;
// request
self.total_received = self.total_received.saturating_add(total_received);
self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
self.time_spent.append(&mut time_spent);
// filter
self.filter_with_geo_radius |= filter_with_geo_radius;
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
self.filter_sum_of_criteria_terms =
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
self.filter_total_number_of_criteria =
self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria);
for (key, value) in used_syntax.into_iter() {
let used_syntax = self.used_syntax.entry(key).or_insert(0);
*used_syntax = used_syntax.saturating_add(value);
}
self.retrieve_vectors |= retrieve_vectors;
// pagination
self.max_limit = self.max_limit.max(max_limit);
self.max_offset = self.max_offset.max(max_offset);
// formatting
self.max_attributes_to_retrieve =
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve);
// scoring
self.show_ranking_score |= show_ranking_score;
self.show_ranking_score_details |= show_ranking_score_details;
self.ranking_score_threshold |= ranking_score_threshold;
self
}
fn into_event(self: Box<Self>) -> serde_json::Value {
let Self {
total_received,
total_succeeded,
time_spent,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_sum_of_criteria_terms,
filter_total_number_of_criteria,
used_syntax,
max_limit,
max_offset,
max_attributes_to_retrieve,
show_ranking_score,
show_ranking_score_details,
ranking_score_threshold,
retrieve_vectors,
marker: _,
} = *self;
// we get all the values in a sorted manner
let time_spent = time_spent.into_sorted_vec();
// the index of the 99th percentage of value
let percentile_99th = time_spent.len() * 99 / 100;
// We are only interested by the slowest value of the 99th fastest results
let time_spent = time_spent.get(percentile_99th);
json!({
"requests": {
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
"total_succeeded": total_succeeded,
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic
"total_received": total_received,
},
"filter": {
"with_geoRadius": filter_with_geo_radius,
"with_geoBoundingBox": filter_with_geo_bounding_box,
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
},
"vector": {
"retrieve_vectors": retrieve_vectors,
},
"pagination": {
"max_limit": max_limit,
"max_offset": max_offset,
},
"formatting": {
"max_attributes_to_retrieve": max_attributes_to_retrieve,
},
"scoring": {
"show_ranking_score": show_ranking_score,
"show_ranking_score_details": show_ranking_score_details,
"ranking_score_threshold": ranking_score_threshold,
}
})
}
}
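The `into_event` implementation above reports the response time at the 99th percentile of the batch. A small worked sketch of that lookup, using assumed sample data:

use std::collections::BinaryHeap;

// Mirrors the percentile lookup in `into_event` above, on assumed timings.
fn p99(time_spent: BinaryHeap<usize>) -> Option<usize> {
    let sorted = time_spent.into_sorted_vec(); // ascending order
    let index = sorted.len() * 99 / 100; // e.g. 250 samples -> index 247
    sorted.get(index).copied() // None when no request was recorded at all
}

fn main() {
    let timings: BinaryHeap<usize> = (1..=250).collect(); // 1 ms .. 250 ms
    assert_eq!(p99(timings), Some(248)); // index 247 of the sorted vec
}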

View File

@ -25,7 +25,6 @@ pub mod indexes;
mod logs; mod logs;
mod metrics; mod metrics;
mod multi_search; mod multi_search;
mod multi_search_analytics;
mod snapshot; mod snapshot;
mod swap_indexes; mod swap_indexes;
pub mod tasks; pub mod tasks;

View File

@ -9,8 +9,7 @@ use meilisearch_types::keys::actions;
use serde::Serialize; use serde::Serialize;
use tracing::debug; use tracing::debug;
use super::multi_search_analytics::MultiSearchAggregator; use crate::analytics::{Analytics, MultiSearchAggregator};
use crate::analytics::Analytics;
use crate::error::MeilisearchHttpError; use crate::error::MeilisearchHttpError;
use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::policies::ActionPolicy;
use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::authentication::{AuthenticationError, GuardedData};
@ -36,7 +35,7 @@ pub async fn multi_search_with_post(
search_queue: Data<SearchQueue>, search_queue: Data<SearchQueue>,
params: AwebJson<FederatedSearch, DeserrJsonError>, params: AwebJson<FederatedSearch, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
// Since we don't want to process half of the search requests and then get a permit refused // Since we don't want to process half of the search requests and then get a permit refused
// we're going to get one permit for the whole duration of the multi-search request. // we're going to get one permit for the whole duration of the multi-search request.
@ -44,7 +43,7 @@ pub async fn multi_search_with_post(
let federated_search = params.into_inner(); let federated_search = params.into_inner();
let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search); let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search, &req);
let FederatedSearch { mut queries, federation } = federated_search; let FederatedSearch { mut queries, federation } = federated_search;
@ -88,7 +87,7 @@ pub async fn multi_search_with_post(
multi_aggregate.succeed(); multi_aggregate.succeed();
} }
analytics.publish(multi_aggregate, &req); analytics.post_multi_search(multi_aggregate);
HttpResponse::Ok().json(search_result??) HttpResponse::Ok().json(search_result??)
} }
None => { None => {
@ -150,7 +149,7 @@ pub async fn multi_search_with_post(
if search_results.is_ok() { if search_results.is_ok() {
multi_aggregate.succeed(); multi_aggregate.succeed();
} }
analytics.publish(multi_aggregate, &req); analytics.post_multi_search(multi_aggregate);
let search_results = search_results.map_err(|(mut err, query_index)| { let search_results = search_results.map_err(|(mut err, query_index)| {
// Add the query index that failed as context for the error message. // Add the query index that failed as context for the error message.

View File

@ -1,168 +0,0 @@
use std::collections::HashSet;
use serde_json::json;
use crate::analytics::Aggregate;
use crate::search::{FederatedSearch, SearchQueryWithIndex};
#[derive(Default)]
pub struct MultiSearchAggregator {
// requests
total_received: usize,
total_succeeded: usize,
// sum of the number of distinct indexes in each single request, use with total_received to compute an avg
total_distinct_index_count: usize,
// number of queries with a single index, use with total_received to compute a proportion
total_single_index: usize,
// sum of the number of search queries in the requests, use with total_received to compute an average
total_search_count: usize,
// scoring
show_ranking_score: bool,
show_ranking_score_details: bool,
// federation
use_federation: bool,
}
impl MultiSearchAggregator {
pub fn from_federated_search(federated_search: &FederatedSearch) -> Self {
let use_federation = federated_search.federation.is_some();
let distinct_indexes: HashSet<_> = federated_search
.queries
.iter()
.map(|query| {
let query = &query;
// make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex
let SearchQueryWithIndex {
index_uid,
federation_options: _,
q: _,
vector: _,
offset: _,
limit: _,
page: _,
hits_per_page: _,
attributes_to_retrieve: _,
retrieve_vectors: _,
attributes_to_crop: _,
crop_length: _,
attributes_to_highlight: _,
show_ranking_score: _,
show_ranking_score_details: _,
show_matches_position: _,
filter: _,
sort: _,
distinct: _,
facets: _,
highlight_pre_tag: _,
highlight_post_tag: _,
crop_marker: _,
matching_strategy: _,
attributes_to_search_on: _,
hybrid: _,
ranking_score_threshold: _,
locales: _,
} = query;
index_uid.as_str()
})
.collect();
let show_ranking_score =
federated_search.queries.iter().any(|query| query.show_ranking_score);
let show_ranking_score_details =
federated_search.queries.iter().any(|query| query.show_ranking_score_details);
Self {
total_received: 1,
total_succeeded: 0,
total_distinct_index_count: distinct_indexes.len(),
total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 },
total_search_count: federated_search.queries.len(),
show_ranking_score,
show_ranking_score_details,
use_federation,
}
}
pub fn succeed(&mut self) {
self.total_succeeded = self.total_succeeded.saturating_add(1);
}
}
impl Aggregate for MultiSearchAggregator {
fn event_name(&self) -> &'static str {
"Documents Searched by Multi-Search POST"
}
/// Aggregate one [MultiSearchAggregator] into another.
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
// write the aggregate in a way that will cause a compilation error if a field is added.
// take ownership of the boxed aggregate so every field has to be handled explicitly.
let this = *self;
let total_received = this.total_received.saturating_add(new.total_received);
let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded);
let total_distinct_index_count =
this.total_distinct_index_count.saturating_add(new.total_distinct_index_count);
let total_single_index = this.total_single_index.saturating_add(new.total_single_index);
let total_search_count = this.total_search_count.saturating_add(new.total_search_count);
let show_ranking_score = this.show_ranking_score || new.show_ranking_score;
let show_ranking_score_details =
this.show_ranking_score_details || new.show_ranking_score_details;
let use_federation = this.use_federation || new.use_federation;
Box::new(Self {
total_received,
total_succeeded,
total_distinct_index_count,
total_single_index,
total_search_count,
show_ranking_score,
show_ranking_score_details,
use_federation,
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
let Self {
total_received,
total_succeeded,
total_distinct_index_count,
total_single_index,
total_search_count,
show_ranking_score,
show_ranking_score_details,
use_federation,
} = *self;
json!({
"requests": {
"total_succeeded": total_succeeded,
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic
"total_received": total_received,
},
"indexes": {
"total_single_index": total_single_index,
"total_distinct_index_count": total_distinct_index_count,
"avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // total_received is never 0 here, so the division is safe
},
"searches": {
"total_search_count": total_search_count,
"avg_search_count": (total_search_count as f64) / (total_received as f64),
},
"scoring": {
"show_ranking_score": show_ranking_score,
"show_ranking_score_details": show_ranking_score_details,
},
"federation": {
"use_federation": use_federation,
}
})
}
}
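The two `avg_*` figures in the event above are plain ratios over `total_received`. A worked sketch with assumed numbers:

fn main() {
    // Assumed sample: three multi-search requests touching 1, 2 and 2 distinct
    // indexes and carrying 2, 3 and 5 queries respectively.
    let total_received = 3usize;
    let total_distinct_index_count = 1 + 2 + 2; // 5
    let total_search_count = 2 + 3 + 5; // 10

    // Same ratios as `into_event` above; an aggregator is only published after
    // at least one request, so total_received is never 0.
    let avg_distinct_index_count = total_distinct_index_count as f64 / total_received as f64;
    let avg_search_count = total_search_count as f64 / total_received as f64;

    println!("avg_distinct_index_count = {avg_distinct_index_count:.2}"); // 1.67
    println!("avg_search_count = {avg_search_count:.2}"); // 3.33
}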

View File

@ -3,6 +3,7 @@ use actix_web::{web, HttpRequest, HttpResponse};
use index_scheduler::IndexScheduler; use index_scheduler::IndexScheduler;
use meilisearch_types::error::ResponseError; use meilisearch_types::error::ResponseError;
use meilisearch_types::tasks::KindWithContent; use meilisearch_types::tasks::KindWithContent;
use serde_json::json;
use tracing::debug; use tracing::debug;
use crate::analytics::Analytics; use crate::analytics::Analytics;
@ -16,15 +17,13 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
} }
crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created");
pub async fn create_snapshot( pub async fn create_snapshot(
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
analytics.publish(SnapshotAnalytics::default(), &req); analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
let task = KindWithContent::SnapshotCreation; let task = KindWithContent::SnapshotCreation;
let uid = get_task_id(&req, &opt)?; let uid = get_task_id(&req, &opt)?;
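The `crate::empty_analytics!` invocation above replaces the old hand-written event with a struct that carries no data beyond its name. A hedged sketch of what such a macro plausibly expands to; the trait shape mirrors the `impl Aggregate for ...` blocks elsewhere in this diff, and the real macro may differ.

use serde_json::{json, Value};

// Assumed trait shape, inferred from the Aggregate impls shown in this diff.
pub trait Aggregate: 'static {
    fn event_name(&self) -> &'static str;
    fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self>;
    fn into_event(self: Box<Self>) -> Value;
}

// Hedged sketch of `empty_analytics!`; names and details are assumptions.
macro_rules! empty_analytics {
    ($struct_name:ident, $event_name:literal) => {
        #[derive(Default)]
        pub struct $struct_name;

        impl Aggregate for $struct_name {
            fn event_name(&self) -> &'static str {
                $event_name
            }

            // There is nothing to merge: any two empty events are identical.
            fn aggregate(self: Box<Self>, _new: Box<Self>) -> Box<Self> {
                self
            }

            fn into_event(self: Box<Self>) -> Value {
                json!({})
            }
        }
    };
}

empty_analytics!(SnapshotAnalytics, "Snapshot Created");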

View File

@ -8,10 +8,10 @@ use meilisearch_types::error::deserr_codes::InvalidSwapIndexes;
use meilisearch_types::error::ResponseError; use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use meilisearch_types::tasks::{IndexSwap, KindWithContent};
use serde::Serialize; use serde_json::json;
use super::{get_task_id, is_dry_run, SummarizedTaskView}; use super::{get_task_id, is_dry_run, SummarizedTaskView};
use crate::analytics::{Aggregate, Analytics}; use crate::analytics::Analytics;
use crate::error::MeilisearchHttpError; use crate::error::MeilisearchHttpError;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::authentication::{AuthenticationError, GuardedData};
@ -29,36 +29,21 @@ pub struct SwapIndexesPayload {
indexes: Vec<IndexUid>, indexes: Vec<IndexUid>,
} }
#[derive(Serialize)]
struct IndexSwappedAnalytics {
swap_operation_number: usize,
}
impl Aggregate for IndexSwappedAnalytics {
fn event_name(&self) -> &'static str {
"Indexes Swapped"
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self {
swap_operation_number: self.swap_operation_number.max(new.swap_operation_number),
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(*self).unwrap_or_default()
}
}
pub async fn swap_indexes( pub async fn swap_indexes(
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>, params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let params = params.into_inner(); let params = params.into_inner();
analytics.publish(IndexSwappedAnalytics { swap_operation_number: params.len() }, &req); analytics.publish(
"Indexes Swapped".to_string(),
json!({
"swap_operation_number": params.len(),
}),
Some(&req),
);
let filters = index_scheduler.filters(); let filters = index_scheduler.filters();
let mut swaps = vec![]; let mut swaps = vec![];

View File

@ -12,17 +12,18 @@ use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList};
use meilisearch_types::task_view::TaskView; use meilisearch_types::task_view::TaskView;
use meilisearch_types::tasks::{Kind, KindWithContent, Status}; use meilisearch_types::tasks::{Kind, KindWithContent, Status};
use serde::Serialize; use serde::Serialize;
use serde_json::json;
use time::format_description::well_known::Rfc3339; use time::format_description::well_known::Rfc3339;
use time::macros::format_description; use time::macros::format_description;
use time::{Date, Duration, OffsetDateTime, Time}; use time::{Date, Duration, OffsetDateTime, Time};
use tokio::task; use tokio::task;
use super::{get_task_id, is_dry_run, SummarizedTaskView}; use super::{get_task_id, is_dry_run, SummarizedTaskView};
use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::{aggregate_methods, Opt}; use crate::Opt;
const DEFAULT_LIMIT: u32 = 20; const DEFAULT_LIMIT: u32 = 20;
@ -157,69 +158,12 @@ impl TaskDeletionOrCancelationQuery {
} }
} }
aggregate_methods!(
CancelTasks => "Tasks Canceled",
DeleteTasks => "Tasks Deleted",
);
#[derive(Serialize)]
struct TaskFilterAnalytics<Method: AggregateMethod> {
filtered_by_uid: bool,
filtered_by_index_uid: bool,
filtered_by_type: bool,
filtered_by_status: bool,
filtered_by_canceled_by: bool,
filtered_by_before_enqueued_at: bool,
filtered_by_after_enqueued_at: bool,
filtered_by_before_started_at: bool,
filtered_by_after_started_at: bool,
filtered_by_before_finished_at: bool,
filtered_by_after_finished_at: bool,
#[serde(skip)]
marker: std::marker::PhantomData<Method>,
}
impl<Method: AggregateMethod + 'static> Aggregate for TaskFilterAnalytics<Method> {
fn event_name(&self) -> &'static str {
Method::event_name()
}
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
Box::new(Self {
filtered_by_uid: self.filtered_by_uid | new.filtered_by_uid,
filtered_by_index_uid: self.filtered_by_index_uid | new.filtered_by_index_uid,
filtered_by_type: self.filtered_by_type | new.filtered_by_type,
filtered_by_status: self.filtered_by_status | new.filtered_by_status,
filtered_by_canceled_by: self.filtered_by_canceled_by | new.filtered_by_canceled_by,
filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at
| new.filtered_by_before_enqueued_at,
filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at
| new.filtered_by_after_enqueued_at,
filtered_by_before_started_at: self.filtered_by_before_started_at
| new.filtered_by_before_started_at,
filtered_by_after_started_at: self.filtered_by_after_started_at
| new.filtered_by_after_started_at,
filtered_by_before_finished_at: self.filtered_by_before_finished_at
| new.filtered_by_before_finished_at,
filtered_by_after_finished_at: self.filtered_by_after_finished_at
| new.filtered_by_after_finished_at,
marker: std::marker::PhantomData,
})
}
fn into_event(self: Box<Self>) -> serde_json::Value {
serde_json::to_value(*self).unwrap_or_default()
}
}
async fn cancel_tasks( async fn cancel_tasks(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>, params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let params = params.into_inner(); let params = params.into_inner();
@ -228,22 +172,21 @@ async fn cancel_tasks(
} }
analytics.publish( analytics.publish(
TaskFilterAnalytics::<CancelTasks> { "Tasks Canceled".to_string(),
filtered_by_uid: params.uids.is_some(), json!({
filtered_by_index_uid: params.index_uids.is_some(), "filtered_by_uid": params.uids.is_some(),
filtered_by_type: params.types.is_some(), "filtered_by_index_uid": params.index_uids.is_some(),
filtered_by_status: params.statuses.is_some(), "filtered_by_type": params.types.is_some(),
filtered_by_canceled_by: params.canceled_by.is_some(), "filtered_by_status": params.statuses.is_some(),
filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(), "filtered_by_canceled_by": params.canceled_by.is_some(),
filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(), "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
filtered_by_before_started_at: params.before_started_at.is_some(), "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
filtered_by_after_started_at: params.after_started_at.is_some(), "filtered_by_before_started_at": params.before_started_at.is_some(),
filtered_by_before_finished_at: params.before_finished_at.is_some(), "filtered_by_after_started_at": params.after_started_at.is_some(),
filtered_by_after_finished_at: params.after_finished_at.is_some(), "filtered_by_before_finished_at": params.before_finished_at.is_some(),
"filtered_by_after_finished_at": params.after_finished_at.is_some(),
marker: std::marker::PhantomData, }),
}, Some(&req),
&req,
); );
let query = params.into_query(); let query = params.into_query();
@ -271,7 +214,7 @@ async fn delete_tasks(
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>, params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
req: HttpRequest, req: HttpRequest,
opt: web::Data<Opt>, opt: web::Data<Opt>,
analytics: web::Data<Analytics>, analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let params = params.into_inner(); let params = params.into_inner();
@ -280,24 +223,22 @@ async fn delete_tasks(
} }
analytics.publish( analytics.publish(
TaskFilterAnalytics::<DeleteTasks> { "Tasks Deleted".to_string(),
filtered_by_uid: params.uids.is_some(), json!({
filtered_by_index_uid: params.index_uids.is_some(), "filtered_by_uid": params.uids.is_some(),
filtered_by_type: params.types.is_some(), "filtered_by_index_uid": params.index_uids.is_some(),
filtered_by_status: params.statuses.is_some(), "filtered_by_type": params.types.is_some(),
filtered_by_canceled_by: params.canceled_by.is_some(), "filtered_by_status": params.statuses.is_some(),
filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(), "filtered_by_canceled_by": params.canceled_by.is_some(),
filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(), "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
filtered_by_before_started_at: params.before_started_at.is_some(), "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
filtered_by_after_started_at: params.after_started_at.is_some(), "filtered_by_before_started_at": params.before_started_at.is_some(),
filtered_by_before_finished_at: params.before_finished_at.is_some(), "filtered_by_after_started_at": params.after_started_at.is_some(),
filtered_by_after_finished_at: params.after_finished_at.is_some(), "filtered_by_before_finished_at": params.before_finished_at.is_some(),
"filtered_by_after_finished_at": params.after_finished_at.is_some(),
marker: std::marker::PhantomData, }),
}, Some(&req),
&req,
); );
let query = params.into_query(); let query = params.into_query();
let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes( let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes(
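Both the similar routes and the task cancel/delete routes above parameterize a single aggregator over a marker type produced by `aggregate_methods!`, so one struct can emit differently named events. A hedged sketch of what that macro plausibly expands to, inferred from the `Method::event_name()` calls in this diff; the real definition may differ.

// Assumed trait shape: only `event_name` is needed by the call sites above.
pub trait AggregateMethod: 'static {
    fn event_name() -> &'static str;
}

// Hedged sketch of `aggregate_methods!`; details are assumptions.
macro_rules! aggregate_methods {
    ($($method:ident => $event_name:literal,)+) => {
        $(
            #[derive(Default)]
            pub struct $method;

            impl AggregateMethod for $method {
                fn event_name() -> &'static str {
                    $event_name
                }
            }
        )+
    };
}

aggregate_methods!(
    CancelTasks => "Tasks Canceled",
    DeleteTasks => "Tasks Deleted",
);

// A `TaskFilterAnalytics::<CancelTasks>` and a `TaskFilterAnalytics::<DeleteTasks>`
// then share all their fields but report "Tasks Canceled" and "Tasks Deleted"
// respectively through `Method::event_name()`.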

View File

@ -1195,13 +1195,8 @@ impl<'a> HitMaker<'a> {
let vectors_is_hidden = match (&displayed_ids, vectors_fid) { let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
(None, _) => false, (None, _) => false,
// vectors has no fid, so check its explicit name // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
(Some(_), None) => { (Some(_), None) => true,
// unwrap is safe: if the displayed fields were the `*` wildcard we would have matched the first arm
let displayed_names = index.displayed_fields(rtxn)?.unwrap();
!displayed_names
.contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME)
}
// displayed_ids is a finite list, so hide if `_vectors` is not part of it // displayed_ids is a finite list, so hide if `_vectors` is not part of it
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
}; };

View File

@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() {
meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###"
{ {
"message": "Unknown value `doc.add` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `stats.*`, `metrics.get`, `metrics.*`, `dumps.create`, `dumps.*`, `snapshots.create`, `snapshots.*`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `*`", "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `metrics.get`, `dumps.create`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `*`",
"code": "invalid_api_key_actions", "code": "invalid_api_key_actions",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions"

View File

@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() {
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Unknown value `doggo` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `stats.*`, `metrics.get`, `metrics.*`, `dumps.create`, `dumps.*`, `snapshots.create`, `snapshots.*`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `*`", "message": "Unknown value `doggo` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `metrics.get`, `dumps.create`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `*`",
"code": "invalid_api_key_actions", "code": "invalid_api_key_actions",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions"

View File

@ -9,7 +9,8 @@ use urlencoding::encode as urlencode;
use super::encoder::Encoder; use super::encoder::Encoder;
use super::service::Service; use super::service::Service;
use super::{Owned, Shared, Value}; use super::Value;
use super::{Owned, Shared};
use crate::json; use crate::json;
pub struct Index<'a, State = Owned> { pub struct Index<'a, State = Owned> {

View File

@ -386,6 +386,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
db_path: dir.as_ref().join("db"), db_path: dir.as_ref().join("db"),
dump_dir: dir.as_ref().join("dumps"), dump_dir: dir.as_ref().join("dumps"),
env: "development".to_owned(), env: "development".to_owned(),
#[cfg(feature = "analytics")]
no_analytics: true, no_analytics: true,
max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(), max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(),
max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(), max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(),

View File

@ -9,9 +9,8 @@ use actix_web::test;
use actix_web::test::TestRequest; use actix_web::test::TestRequest;
use actix_web::web::Data; use actix_web::web::Data;
use index_scheduler::IndexScheduler; use index_scheduler::IndexScheduler;
use meilisearch::analytics::Analytics;
use meilisearch::search_queue::SearchQueue; use meilisearch::search_queue::SearchQueue;
use meilisearch::{create_app, Opt, SubscriberForSecondLayer}; use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
use meilisearch_auth::AuthController; use meilisearch_auth::AuthController;
use tracing::level_filters::LevelFilter; use tracing::level_filters::LevelFilter;
use tracing_subscriber::Layer; use tracing_subscriber::Layer;
@ -142,7 +141,7 @@ impl Service {
Data::new(search_queue), Data::new(search_queue),
self.options.clone(), self.options.clone(),
(route_layer_handle, stderr_layer_handle), (route_layer_handle, stderr_layer_handle),
Data::new(Analytics::no_analytics()), analytics::MockAnalytics::new(&self.options),
true, true,
)) ))
.await .await

View File

@ -23,8 +23,8 @@ async fn error_document_update_create_index_bad_uid() {
#[actix_rt::test] #[actix_rt::test]
async fn document_update_with_primary_key() { async fn document_update_with_primary_key() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index(); let index = server.index("test");
let documents = json!([ let documents = json!([
{ {
@ -32,14 +32,15 @@ async fn document_update_with_primary_key() {
"content": "foo", "content": "foo",
} }
]); ]);
let (response, code) = index.update_documents(documents, Some("primary")).await; let (_response, code) = index.update_documents(documents, Some("primary")).await;
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(0).await;
let (response, code) = index.get_task(response.uid()).await; let (response, code) = index.get_task(0).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["status"], "succeeded"); assert_eq!(response["status"], "succeeded");
assert_eq!(response["uid"], 0);
assert_eq!(response["type"], "documentAdditionOrUpdate"); assert_eq!(response["type"], "documentAdditionOrUpdate");
assert_eq!(response["details"]["indexedDocuments"], 1); assert_eq!(response["details"]["indexedDocuments"], 1);
assert_eq!(response["details"]["receivedDocuments"], 1); assert_eq!(response["details"]["receivedDocuments"], 1);
@ -51,8 +52,8 @@ async fn document_update_with_primary_key() {
#[actix_rt::test] #[actix_rt::test]
async fn update_document() { async fn update_document() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index(); let index = server.index("test");
let documents = json!([ let documents = json!([
{ {
@ -61,10 +62,10 @@ async fn update_document() {
} }
]); ]);
let (response, code) = index.add_documents(documents, None).await; let (_response, code) = index.add_documents(documents, None).await;
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(0).await;
let documents = json!([ let documents = json!([
{ {
@ -76,9 +77,9 @@ async fn update_document() {
let (response, code) = index.update_documents(documents, None).await; let (response, code) = index.update_documents(documents, None).await;
assert_eq!(code, 202, "response: {}", response); assert_eq!(code, 202, "response: {}", response);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(1).await;
let (response, code) = index.get_task(response.uid()).await; let (response, code) = index.get_task(1).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["status"], "succeeded"); assert_eq!(response["status"], "succeeded");
@ -95,8 +96,8 @@ async fn update_document() {
#[actix_rt::test] #[actix_rt::test]
async fn update_document_gzip_encoded() { async fn update_document_gzip_encoded() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index_with_encoder(Encoder::Gzip); let index = server.index_with_encoder("test", Encoder::Gzip);
let documents = json!([ let documents = json!([
{ {
@ -105,10 +106,10 @@ async fn update_document_gzip_encoded() {
} }
]); ]);
let (response, code) = index.add_documents(documents, None).await; let (_response, code) = index.add_documents(documents, None).await;
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(0).await;
let documents = json!([ let documents = json!([
{ {
@ -120,9 +121,9 @@ async fn update_document_gzip_encoded() {
let (response, code) = index.update_documents(documents, None).await; let (response, code) = index.update_documents(documents, None).await;
assert_eq!(code, 202, "response: {}", response); assert_eq!(code, 202, "response: {}", response);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(1).await;
let (response, code) = index.get_task(response.uid()).await; let (response, code) = index.get_task(1).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["status"], "succeeded"); assert_eq!(response["status"], "succeeded");
@ -139,12 +140,12 @@ async fn update_document_gzip_encoded() {
#[actix_rt::test] #[actix_rt::test]
async fn update_larger_dataset() { async fn update_larger_dataset() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index(); let index = server.index("test");
let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(); let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap();
let (task, _code) = index.update_documents(documents, None).await; index.update_documents(documents, None).await;
index.wait_task(task.uid()).await.succeeded(); index.wait_task(0).await;
let (response, code) = index.get_task(task.uid()).await; let (response, code) = index.get_task(0).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["type"], "documentAdditionOrUpdate"); assert_eq!(response["type"], "documentAdditionOrUpdate");
assert_eq!(response["details"]["indexedDocuments"], 77); assert_eq!(response["details"]["indexedDocuments"], 77);
@ -157,8 +158,8 @@ async fn update_larger_dataset() {
#[actix_rt::test] #[actix_rt::test]
async fn error_update_documents_bad_document_id() { async fn error_update_documents_bad_document_id() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index(); let index = server.index("test");
index.create(Some("docid")).await; index.create(Some("docid")).await;
let documents = json!([ let documents = json!([
{ {
@ -166,8 +167,8 @@ async fn error_update_documents_bad_document_id() {
"content": "foobar" "content": "foobar"
} }
]); ]);
let (task, _code) = index.update_documents(documents, None).await; index.update_documents(documents, None).await;
let response = index.wait_task(task.uid()).await; let response = index.wait_task(1).await;
assert_eq!(response["status"], json!("failed")); assert_eq!(response["status"], json!("failed"));
assert_eq!( assert_eq!(
response["error"]["message"], response["error"]["message"],
@ -185,8 +186,8 @@ async fn error_update_documents_bad_document_id() {
#[actix_rt::test] #[actix_rt::test]
async fn error_update_documents_missing_document_id() { async fn error_update_documents_missing_document_id() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index(); let index = server.index("test");
index.create(Some("docid")).await; index.create(Some("docid")).await;
let documents = json!([ let documents = json!([
{ {
@ -194,8 +195,8 @@ async fn error_update_documents_missing_document_id() {
"content": "foobar" "content": "foobar"
} }
]); ]);
let (task, _code) = index.update_documents(documents, None).await; index.update_documents(documents, None).await;
let response = index.wait_task(task.uid()).await; let response = index.wait_task(1).await;
assert_eq!(response["status"], "failed"); assert_eq!(response["status"], "failed");
assert_eq!( assert_eq!(
response["error"]["message"], response["error"]["message"],
@ -211,8 +212,8 @@ async fn error_update_documents_missing_document_id() {
#[actix_rt::test] #[actix_rt::test]
async fn update_faceted_document() { async fn update_faceted_document() {
let server = Server::new_shared(); let server = Server::new().await;
let index = server.unique_index(); let index = server.index("test");
let (response, code) = index let (response, code) = index
.update_settings(json!({ .update_settings(json!({
@ -220,7 +221,7 @@ async fn update_faceted_document() {
})) }))
.await; .await;
assert_eq!("202", code.as_str(), "{:?}", response); assert_eq!("202", code.as_str(), "{:?}", response);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(0).await;
let documents: Vec<_> = (0..1000) let documents: Vec<_> = (0..1000)
.map(|id| { .map(|id| {
@ -231,10 +232,10 @@ async fn update_faceted_document() {
}) })
.collect(); .collect();
let (response, code) = index.add_documents(documents.into(), None).await; let (_response, code) = index.add_documents(documents.into(), None).await;
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(1).await;
let documents = json!([ let documents = json!([
{ {
@ -246,7 +247,7 @@ async fn update_faceted_document() {
let (response, code) = index.update_documents(documents, None).await; let (response, code) = index.update_documents(documents, None).await;
assert_eq!(code, 202, "response: {}", response); assert_eq!(code, 202, "response: {}", response);
index.wait_task(response.uid()).await.succeeded(); index.wait_task(2).await;
index index
.search(json!({"limit": 10}), |response, code| { .search(json!({"limit": 10}), |response, code| {

View File

@ -7,9 +7,8 @@ use std::str::FromStr;
use actix_web::http::header::ContentType; use actix_web::http::header::ContentType;
use actix_web::web::Data; use actix_web::web::Data;
use meili_snap::snapshot; use meili_snap::snapshot;
use meilisearch::analytics::Analytics;
use meilisearch::search_queue::SearchQueue; use meilisearch::search_queue::SearchQueue;
use meilisearch::{create_app, Opt, SubscriberForSecondLayer}; use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
use tracing::level_filters::LevelFilter; use tracing::level_filters::LevelFilter;
use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::Layer; use tracing_subscriber::Layer;
@ -55,7 +54,7 @@ async fn basic_test_log_stream_route() {
Data::new(search_queue), Data::new(search_queue),
server.service.options.clone(), server.service.options.clone(),
(route_layer_handle, stderr_layer_handle), (route_layer_handle, stderr_layer_handle),
Data::new(Analytics::no_analytics()), analytics::MockAnalytics::new(&server.service.options),
true, true,
)) ))
.await; .await;

View File

@ -568,57 +568,6 @@ async fn retrieve_vectors() {
] ]
"###); "###);
// use explicit `_vectors` in displayed attributes
let (response, code) = index
.update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} ))
.await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"embedder": "default", "semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
}
]
"###);
// remove `_vectors` from displayed attributes // remove `_vectors` from displayed attributes
let (response, code) = let (response, code) =
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;

View File

@ -4,53 +4,6 @@ use crate::common::{GetAllDocumentsOptions, Server};
use crate::json; use crate::json;
use crate::vector::generate_default_user_provided_documents; use crate::vector::generate_default_user_provided_documents;
#[actix_rt::test]
async fn field_unavailable_for_source() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false,
"editDocumentsByFunction": false,
"containsFilter": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}},
}))
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`",
"code": "invalid_settings_embedders",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": { "default": {"source": "openAi", "revision": "42"}},
}))
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`",
"code": "invalid_settings_embedders",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
}
"###);
}
#[actix_rt::test] #[actix_rt::test]
async fn update_embedder() { async fn update_embedder() {
let server = Server::new().await; let server = Server::new().await;

View File

@ -16,6 +16,5 @@ file-store = { path = "../file-store" }
meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" } meilisearch-types = { path = "../meilisearch-types" }
serde = { version = "1.0.209", features = ["derive"] } serde = { version = "1.0.209", features = ["derive"] }
time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] } time = { version = "0.3.36", features = ["formatting"] }
uuid = { version = "1.10.0", features = ["v4"], default-features = false } uuid = { version = "1.10.0", features = ["v4"], default-features = false }
arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" }

View File

@ -2,7 +2,7 @@ use std::fs::{read_dir, read_to_string, remove_file, File};
use std::io::BufWriter; use std::io::BufWriter;
use std::path::PathBuf; use std::path::PathBuf;
use anyhow::Context; use anyhow::{bail, Context};
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use dump::{DumpWriter, IndexMetadata}; use dump::{DumpWriter, IndexMetadata};
use file_store::FileStore; use file_store::FileStore;
@ -10,16 +10,15 @@ use meilisearch_auth::AuthController;
use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::index::{db_name, main_key};
use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::milli::{obkv_to_json, BEU32};
use meilisearch_types::tasks::{Status, Task}; use meilisearch_types::tasks::{Status, Task};
use meilisearch_types::versioning::{get_version, parse_version}; use meilisearch_types::versioning::{create_version_file, get_version, parse_version};
use meilisearch_types::Index; use meilisearch_types::Index;
use time::macros::format_description; use time::macros::format_description;
use time::OffsetDateTime; use time::OffsetDateTime;
use upgrade::OfflineUpgrade;
use uuid_codec::UuidCodec; use uuid_codec::UuidCodec;
mod upgrade;
mod uuid_codec; mod uuid_codec;
#[derive(Parser)] #[derive(Parser)]
@ -73,7 +72,7 @@ enum Command {
/// ///
/// Supported upgrade paths: /// Supported upgrade paths:
/// ///
/// - v1.9.x -> v1.10.x -> v1.11.x /// - v1.9.0 -> v1.10.0
OfflineUpgrade { OfflineUpgrade {
#[arg(long)] #[arg(long)]
target_version: String, target_version: String,
@ -97,6 +96,425 @@ fn main() -> anyhow::Result<()> {
} }
} }
struct OfflineUpgrade {
db_path: PathBuf,
current_version: (String, String, String),
target_version: (String, String, String),
}
impl OfflineUpgrade {
fn upgrade(self) -> anyhow::Result<()> {
// TODO: if we make this process support more versions, introduce a more flexible way of checking for the version
// currently only supports v1.9 to v1.10
let (current_major, current_minor, current_patch) = &self.current_version;
match (current_major.as_str(), current_minor.as_str(), current_patch.as_str()) {
("1", "9", _) => {}
_ => {
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9")
}
}
let (target_major, target_minor, target_patch) = &self.target_version;
match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
("1", "10", _) => {}
_ => {
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10")
}
}
println!("Upgrading from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
self.v1_9_to_v1_10()?;
println!("Writing VERSION file");
create_version_file(&self.db_path, target_major, target_minor, target_patch)
.context("while writing VERSION file after the upgrade")?;
println!("Success");
Ok(())
}
fn v1_9_to_v1_10(&self) -> anyhow::Result<()> {
// 2 changes here
// 1. date format. needs to be done before opening the Index
// 2. REST embedders. We don't support this case right now, so bail
let index_scheduler_path = self.db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.with_context(|| {
format!("While trying to open {:?}", index_scheduler_path.display())
})?;
let mut sched_wtxn = env.write_txn()?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &sched_wtxn, "index-mapping")?;
let index_stats: Database<UuidCodec, Unspecified> =
try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
format!("While trying to open {:?}", index_scheduler_path.display())
})?;
let index_count =
index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
// FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
// 1. immutably for the iteration
// 2. mutably for updating index stats
let indexes: Vec<_> = index_mapping
.iter(&sched_wtxn)?
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
.collect();
let mut rest_embedders = Vec::new();
let mut unwrapped_indexes = Vec::new();
// check that update can take place
for (index_index, result) in indexes.into_iter().enumerate() {
let (uid, uuid) = result?;
let index_path = self.db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
// FIXME: fetch the 25 magic number from the index file
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let index_txn = index_env.read_txn().with_context(|| {
format!(
"while obtaining a write transaction for index {uid} at {}",
index_path.display()
)
})?;
println!("\t- Checking for incompatible embedders (REST embedders)");
let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
if rest_embedders_for_index.is_empty() {
unwrapped_indexes.push((uid, uuid));
} else {
// no need to add to unwrapped indexes because we'll exit early
rest_embedders.push((uid, rest_embedders_for_index));
}
}
if !rest_embedders.is_empty() {
let rest_embedders = rest_embedders
.into_iter()
.flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
.map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
.collect::<Vec<_>>()
.join("\n");
bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
The database has not been modified and is still a valid v1.9 database.");
}
println!("Update can take place, updating");
for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
let index_path = self.db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Updating index `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
// FIXME: fetch the 25 magic number from the index file
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let mut index_wtxn = index_env.write_txn().with_context(|| {
format!(
"while obtaining a write transaction for index `{uid}` at `{}`",
index_path.display()
)
})?;
println!("\t- Updating index stats");
update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
println!("\t- Updating date format");
update_date_format(&uid, &index_env, &mut index_wtxn)?;
index_wtxn.commit().with_context(|| {
format!(
"while committing the write txn for index `{uid}` at {}",
index_path.display()
)
})?;
}
sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
println!("Upgrading database succeeded");
Ok(())
}
}
pub mod v1_9 {
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
/// The statistics that can be computed from an `Index` object.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
pub created_at: time::OffsetDateTime,
/// Date of the last update of the index.
pub updated_at: time::OffsetDateTime,
}
use serde::{Deserialize, Serialize};
#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
pub name: String,
pub config: EmbeddingConfig,
}
#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
pub struct EmbeddingConfig {
/// Options of the embedder, specific to each kind of embedder
pub embedder_options: EmbedderOptions,
}
/// Options of an embedder, specific to each kind of embedder.
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum EmbedderOptions {
HuggingFace(hf::EmbedderOptions),
OpenAi(openai::EmbedderOptions),
Ollama(ollama::EmbedderOptions),
UserProvided(manual::EmbedderOptions),
Rest(rest::EmbedderOptions),
}
impl Default for EmbedderOptions {
fn default() -> Self {
Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
}
}
mod hf {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub model: String,
pub revision: Option<String>,
}
}
mod openai {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub api_key: Option<String>,
pub dimensions: Option<usize>,
}
}
mod ollama {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub embedding_model: String,
pub url: Option<String>,
pub api_key: Option<String>,
}
}
mod manual {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub dimensions: usize,
}
}
mod rest {
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
pub struct EmbedderOptions {
pub api_key: Option<String>,
pub dimensions: Option<usize>,
pub url: String,
pub input_field: Vec<String>,
// path to the array of embeddings
pub path_to_embeddings: Vec<String>,
// shape of a single embedding
pub embedding_object: Vec<String>,
}
}
pub type OffsetDateTime = time::OffsetDateTime;
}
pub mod v1_10 {
use crate::v1_9;
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
/// The statistics that can be computed from an `Index` object.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
#[serde(with = "time::serde::rfc3339")]
pub created_at: time::OffsetDateTime,
/// Date of the last update of the index.
#[serde(with = "time::serde::rfc3339")]
pub updated_at: time::OffsetDateTime,
}
impl From<v1_9::IndexStats> for IndexStats {
fn from(
v1_9::IndexStats {
number_of_documents,
database_size,
used_database_size,
field_distribution,
created_at,
updated_at,
}: v1_9::IndexStats,
) -> Self {
IndexStats {
number_of_documents,
database_size,
used_database_size,
field_distribution,
created_at,
updated_at,
}
}
}
#[derive(serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
}
fn update_index_stats(
index_stats: Database<UuidCodec, Unspecified>,
index_uid: &str,
index_uuid: uuid::Uuid,
sched_wtxn: &mut RwTxn,
) -> anyhow::Result<()> {
let ctx = || format!("while updating index stats for index `{index_uid}`");
let stats: Option<v1_9::IndexStats> = index_stats
.remap_data_type::<SerdeJson<v1_9::IndexStats>>()
.get(sched_wtxn, &index_uuid)
.with_context(ctx)?;
if let Some(stats) = stats {
let stats: v1_10::IndexStats = stats.into();
index_stats
.remap_data_type::<SerdeJson<v1_10::IndexStats>>()
.put(sched_wtxn, &index_uuid, &stats)
.with_context(ctx)?;
}
Ok(())
}
fn update_date_format(
index_uid: &str,
index_env: &Env,
index_wtxn: &mut RwTxn,
) -> anyhow::Result<()> {
let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
.with_context(|| format!("while updating date format for index `{index_uid}`"))?;
date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
Ok(())
}
fn find_rest_embedders(
index_uid: &str,
index_env: &Env,
index_txn: &RoTxn,
) -> anyhow::Result<Vec<String>> {
let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
.with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
let mut rest_embedders = vec![];
for config in main
.remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
.get(index_txn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()
{
if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
rest_embedders.push(config.name);
}
}
Ok(rest_embedders)
}
fn date_round_trip(
wtxn: &mut RwTxn,
index_uid: &str,
db: Database<Unspecified, Unspecified>,
key: &str,
) -> anyhow::Result<()> {
let datetime =
db.remap_types::<Str, SerdeJson<v1_9::OffsetDateTime>>().get(wtxn, key).with_context(
|| format!("could not read `{key}` while updating date format for index `{index_uid}`"),
)?;
if let Some(datetime) = datetime {
db.remap_types::<Str, SerdeJson<v1_10::OffsetDateTime>>()
.put(wtxn, key, &v1_10::OffsetDateTime(datetime))
.with_context(|| {
format!(
"could not write `{key}` while updating date format for index `{index_uid}`"
)
})?;
}
Ok(())
}
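As a side note on what `update_date_format` and `date_round_trip` achieve: the v1.10 `OffsetDateTime` wrapper above forces an RFC 3339 representation on disk. A minimal sketch of that serialization, assuming `serde_json` and the `time` crate with its serde support are available; the wrapper name below is illustrative and not taken from the diff:

use serde::Serialize;
use time::macros::datetime;

#[derive(Serialize)]
#[serde(transparent)]
struct Rfc3339DateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime);

fn main() -> Result<(), serde_json::Error> {
    let dt = Rfc3339DateTime(datetime!(2024-11-04 13:32:08 UTC));
    // Prints the RFC 3339 form the v1.10 index stores, e.g. "2024-11-04T13:32:08Z"
    println!("{}", serde_json::to_string(&dt)?);
    Ok(())
}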
/// Clears the task queue located at `db_path`. /// Clears the task queue located at `db_path`.
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
let path = db_path.join("tasks"); let path = db_path.join("tasks");

View File

@ -1,73 +0,0 @@
mod v1_10;
mod v1_11;
mod v1_9;
use std::path::{Path, PathBuf};
use anyhow::{bail, Context};
use meilisearch_types::versioning::create_version_file;
use v1_10::v1_9_to_v1_10;
use crate::upgrade::v1_11::v1_10_to_v1_11;
pub struct OfflineUpgrade {
pub db_path: PathBuf,
pub current_version: (String, String, String),
pub target_version: (String, String, String),
}
impl OfflineUpgrade {
pub fn upgrade(self) -> anyhow::Result<()> {
let upgrade_list = [
(v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"),
(v1_10_to_v1_11, "1", "11", "0"),
];
let (current_major, current_minor, current_patch) = &self.current_version;
let start_at = match (
current_major.as_str(),
current_minor.as_str(),
current_patch.as_str(),
) {
("1", "9", _) => 0,
("1", "10", _) => 1,
_ => {
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10")
}
};
let (target_major, target_minor, target_patch) = &self.target_version;
let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
("1", "10", _) => 0,
("1", "11", _) => 1,
(major, _, _) if major.starts_with('v') => {
bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.")
}
_ => {
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11")
}
};
println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
#[allow(clippy::needless_range_loop)]
for index in start_at..=ends_at {
let (func, major, minor, patch) = upgrade_list[index];
(func)(&self.db_path)?;
println!("Done");
// We're writing the version file just in case an issue arise _while_ upgrading.
// We don't want the DB to fail in an unknown state.
println!("Writing VERSION file");
create_version_file(&self.db_path, major, minor, patch)
.context("while writing VERSION file after the upgrade")?;
}
println!("Success");
Ok(())
}
}
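To illustrate how the `upgrade_list` slice above chains migrations between `start_at` and `ends_at`, here is a self-contained, hedged sketch with stand-in functions; the path and the function bodies are assumptions, not the real migrations:

use std::path::Path;

fn v1_9_to_v1_10(_db_path: &Path) -> anyhow::Result<()> { println!("running 1.9 -> 1.10"); Ok(()) }
fn v1_10_to_v1_11(_db_path: &Path) -> anyhow::Result<()> { println!("running 1.10 -> 1.11"); Ok(()) }

fn main() -> anyhow::Result<()> {
    let upgrade_list = [
        (v1_9_to_v1_10 as fn(&Path) -> anyhow::Result<()>, "1", "10", "0"),
        (v1_10_to_v1_11, "1", "11", "0"),
    ];
    // Upgrading a v1.9.x database to v1.11.x selects indexes 0..=1, so both steps run,
    // and a VERSION file is (conceptually) written after each completed step.
    let (start_at, ends_at) = (0usize, 1usize);
    for (func, major, minor, patch) in &upgrade_list[start_at..=ends_at] {
        func(Path::new("/path/to/db"))?;
        println!("would write VERSION file for {major}.{minor}.{patch}");
    }
    Ok(())
}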

View File

@ -1,289 +0,0 @@
use anyhow::bail;
use std::path::Path;
use anyhow::Context;
use meilisearch_types::{
heed::{
types::{SerdeJson, Str},
Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
},
milli::index::{db_name, main_key},
};
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
use super::v1_9;
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
/// The statistics that can be computed from an `Index` object.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
#[serde(with = "time::serde::rfc3339")]
pub created_at: time::OffsetDateTime,
/// Date of the last update of the index.
#[serde(with = "time::serde::rfc3339")]
pub updated_at: time::OffsetDateTime,
}
impl From<v1_9::IndexStats> for IndexStats {
fn from(
v1_9::IndexStats {
number_of_documents,
database_size,
used_database_size,
field_distribution,
created_at,
updated_at,
}: v1_9::IndexStats,
) -> Self {
IndexStats {
number_of_documents,
database_size,
used_database_size,
field_distribution,
created_at: created_at.0,
updated_at: updated_at.0,
}
}
}
#[derive(serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
fn update_index_stats(
index_stats: Database<UuidCodec, Unspecified>,
index_uid: &str,
index_uuid: uuid::Uuid,
sched_wtxn: &mut RwTxn,
) -> anyhow::Result<()> {
let ctx = || format!("while updating index stats for index `{index_uid}`");
let stats: Option<&str> = index_stats
.remap_data_type::<Str>()
.get(sched_wtxn, &index_uuid)
.with_context(ctx)
.with_context(|| "While reading value")?;
dbg!(stats);
let stats: Option<v1_9::IndexStats> = index_stats
.remap_data_type::<SerdeJson<v1_9::IndexStats>>()
.get(sched_wtxn, &index_uuid)
.with_context(ctx)
.with_context(|| "While reading value")?;
if let Some(stats) = stats {
let stats: self::IndexStats = stats.into();
index_stats
.remap_data_type::<SerdeJson<self::IndexStats>>()
.put(sched_wtxn, &index_uuid, &stats)
.with_context(ctx)
.with_context(|| "While writing value")?;
}
Ok(())
}
fn update_date_format(
index_uid: &str,
index_env: &Env,
index_wtxn: &mut RwTxn,
) -> anyhow::Result<()> {
let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
.with_context(|| format!("while updating date format for index `{index_uid}`"))?;
date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
Ok(())
}
fn find_rest_embedders(
index_uid: &str,
index_env: &Env,
index_txn: &RoTxn,
) -> anyhow::Result<Vec<String>> {
let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
.with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
let mut rest_embedders = vec![];
for config in main
.remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
.get(index_txn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()
{
if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
rest_embedders.push(config.name);
}
}
Ok(rest_embedders)
}
fn date_round_trip(
wtxn: &mut RwTxn,
index_uid: &str,
db: Database<Unspecified, Unspecified>,
key: &str,
) -> anyhow::Result<()> {
let datetime =
db.remap_types::<Str, SerdeJson<v1_9::LegacyDateTime>>().get(wtxn, key).with_context(
|| format!("could not read `{key}` while updating date format for index `{index_uid}`"),
)?;
if let Some(datetime) = datetime {
db.remap_types::<Str, SerdeJson<self::OffsetDateTime>>()
.put(wtxn, key, &self::OffsetDateTime(datetime.0))
.with_context(|| {
format!(
"could not write `{key}` while updating date format for index `{index_uid}`"
)
})?;
}
Ok(())
}
pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> {
println!("Upgrading from v1.9.0 to v1.10.0");
// 2 changes here
// 1. date format. needs to be done before opening the Index
// 2. REST embedders. We don't support this case right now, so bail
let index_scheduler_path = db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
let mut sched_wtxn = env.write_txn()?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &sched_wtxn, "index-mapping")?;
let index_stats: Database<UuidCodec, Unspecified> =
try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
format!("While trying to open {:?}", index_scheduler_path.display())
})?;
let index_count =
index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
// FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
// 1. immutably for the iteration
// 2. mutably for updating index stats
let indexes: Vec<_> = index_mapping
.iter(&sched_wtxn)?
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
.collect();
let mut rest_embedders = Vec::new();
let mut unwrapped_indexes = Vec::new();
// check that update can take place
for (index_index, result) in indexes.into_iter().enumerate() {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
// FIXME: fetch the 25 magic number from the index file
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let index_txn = index_env.read_txn().with_context(|| {
format!(
"while obtaining a write transaction for index {uid} at {}",
index_path.display()
)
})?;
println!("\t- Checking for incompatible embedders (REST embedders)");
let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
if rest_embedders_for_index.is_empty() {
unwrapped_indexes.push((uid, uuid));
} else {
// no need to add to unwrapped indexes because we'll exit early
rest_embedders.push((uid, rest_embedders_for_index));
}
}
if !rest_embedders.is_empty() {
let rest_embedders = rest_embedders
.into_iter()
.flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
.map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
.collect::<Vec<_>>()
.join("\n");
bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
The database has not been modified and is still a valid v1.9 database.");
}
println!("Update can take place, updating");
for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
let index_path = db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Updating index `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
// FIXME: fetch the 25 magic number from the index file
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let mut index_wtxn = index_env.write_txn().with_context(|| {
format!(
"while obtaining a write transaction for index `{uid}` at `{}`",
index_path.display()
)
})?;
println!("\t- Updating index stats");
update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
println!("\t- Updating date format");
update_date_format(&uid, &index_env, &mut index_wtxn)?;
index_wtxn.commit().with_context(|| {
format!("while committing the write txn for index `{uid}` at {}", index_path.display())
})?;
}
sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
println!("Upgrading database succeeded");
Ok(())
}

View File

@ -1,85 +0,0 @@
//! The breaking changes that happened between the v1.10 and the v1.11 are:
//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0
//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata.
//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB.
//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy, so ensuring it works is probably better than anything.
use std::path::Path;
use anyhow::Context;
use meilisearch_types::{
heed::{types::Str, Database, EnvOpenOptions},
milli::index::db_name,
};
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> {
println!("Upgrading from v1.10.0 to v1.11.0");
let index_scheduler_path = db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
let sched_rtxn = env.read_txn()?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &sched_rtxn, "index-mapping")?;
let index_count =
index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?;
let indexes: Vec<_> = index_mapping
.iter(&sched_rtxn)?
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
.collect();
for (index_index, result) in indexes.into_iter().enumerate() {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Updating embeddings for `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let index_rtxn = index_env.read_txn().with_context(|| {
format!(
"while obtaining a read transaction for index {uid} at {}",
index_path.display()
)
})?;
let index_read_database =
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
.with_context(|| format!("while updating date format for index `{uid}`"))?;
let mut index_wtxn = index_env.write_txn().with_context(|| {
format!(
"while obtaining a write transaction for index {uid} at {}",
index_path.display()
)
})?;
let index_write_database =
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
.with_context(|| format!("while updating date format for index `{uid}`"))?;
arroy_v04_to_v05::ugrade_from_prev_version(
&index_rtxn,
index_read_database,
&mut index_wtxn,
index_write_database,
)?;
index_wtxn.commit()?;
}
Ok(())
}

View File

@ -1,158 +0,0 @@
use serde::{Deserialize, Serialize};
use time::{Date, OffsetDateTime, Time, UtcOffset};
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
/// The statistics that can be computed from an `Index` object.
#[derive(serde::Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
pub created_at: LegacyDateTime,
/// Date of the last update of the index.
pub updated_at: LegacyDateTime,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
pub name: String,
pub config: EmbeddingConfig,
}
#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
pub struct EmbeddingConfig {
/// Options of the embedder, specific to each kind of embedder
pub embedder_options: EmbedderOptions,
}
/// Options of an embedder, specific to each kind of embedder.
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum EmbedderOptions {
HuggingFace(hf::EmbedderOptions),
OpenAi(openai::EmbedderOptions),
Ollama(ollama::EmbedderOptions),
UserProvided(manual::EmbedderOptions),
Rest(rest::EmbedderOptions),
}
impl Default for EmbedderOptions {
fn default() -> Self {
Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
}
}
mod hf {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub model: String,
pub revision: Option<String>,
}
}
mod openai {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub api_key: Option<String>,
pub dimensions: Option<usize>,
}
}
mod ollama {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub embedding_model: String,
pub url: Option<String>,
pub api_key: Option<String>,
}
}
mod manual {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub dimensions: usize,
}
}
mod rest {
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
pub struct EmbedderOptions {
pub api_key: Option<String>,
pub dimensions: Option<usize>,
pub url: String,
pub input_field: Vec<String>,
// path to the array of embeddings
pub path_to_embeddings: Vec<String>,
// shape of a single embedding
pub embedding_object: Vec<String>,
}
}
/// A datetime from Meilisearch v1.9 with an unspecified format.
#[derive(Debug)]
pub struct LegacyDateTime(pub OffsetDateTime);
impl<'de> Deserialize<'de> for LegacyDateTime {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct Visitor;
impl<'de> serde::de::Visitor<'de> for Visitor {
type Value = OffsetDateTime;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(formatter, "a valid datetime")
}
// Comes from a binary. The legacy format is:
// 2024-11-04 13:32:08.48368 +00:00:00
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]");
OffsetDateTime::parse(v, format).map_err(E::custom)
}
// Comes from the docker image, the legacy format is:
// [2024, 309, 17, 15, 1, 698184971, 0,0,0]
// year, day in year, hour, minute, sec, subsec , offset stuff
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let mut vec = Vec::new();
// We must deserialize the value as `i64` because the largest values are `u32` and `i32`
while let Some(el) = seq.next_element::<i64>()? {
vec.push(el);
}
if vec.len() != 9 {
return Err(serde::de::Error::custom(format!(
"Invalid datetime, received an array of {} elements instead of 9",
vec.len()
)));
}
Ok(OffsetDateTime::new_in_offset(
Date::from_ordinal_date(vec[0] as i32, vec[1] as u16)
.map_err(serde::de::Error::custom)?,
Time::from_hms_nano(vec[2] as u8, vec[3] as u8, vec[4] as u8, vec[5] as u32)
.map_err(serde::de::Error::custom)?,
UtcOffset::from_hms(vec[6] as i8, vec[7] as i8, vec[8] as i8)
.map_err(serde::de::Error::custom)?,
))
}
}
deserializer.deserialize_any(Visitor).map(LegacyDateTime)
}
}
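For reference, a hedged sketch (not part of the diff) that decodes the two legacy sample values quoted in the comments above with the `time` crate, using the same calls as the deserializer; both resolve to plain `OffsetDateTime` values that the later RFC 3339 rewrite can then store:

use time::{Date, OffsetDateTime, Time, UtcOffset};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Binary form: "2024-11-04 13:32:08.48368 +00:00:00"
    let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]");
    let from_binary = OffsetDateTime::parse("2024-11-04 13:32:08.48368 +00:00:00", format)?;

    // Docker form: [2024, 309, 17, 15, 1, 698184971, 0, 0, 0]
    // (year, ordinal day, hour, minute, second, nanosecond, offset h/m/s); day 309 of 2024 is November 4th.
    let from_docker = OffsetDateTime::new_in_offset(
        Date::from_ordinal_date(2024, 309)?,
        Time::from_hms_nano(17, 15, 1, 698_184_971)?,
        UtcOffset::from_hms(0, 0, 0)?,
    );

    println!("{from_binary} / {from_docker}");
    Ok(())
}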

View File

@ -15,7 +15,7 @@ license.workspace = true
bimap = { version = "0.6.3", features = ["serde"] } bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3" bincode = "1.3.3"
bstr = "1.9.1" bstr = "1.9.1"
bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] } bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
byteorder = "1.5.0" byteorder = "1.5.0"
charabia = { version = "0.9.1", default-features = false } charabia = { version = "0.9.1", default-features = false }
concat-arrays = "0.1.2" concat-arrays = "0.1.2"
@ -79,8 +79,8 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
] } ] }
tiktoken-rs = "0.5.9" tiktoken-rs = "0.5.9"
liquid = "0.26.6" liquid = "0.26.6"
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
arroy = "0.5.0" arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
rand = "0.8.5" rand = "0.8.5"
tracing = "0.1.40" tracing = "0.1.40"
ureq = { version = "2.10.0", features = ["json"] } ureq = { version = "2.10.0", features = ["json"] }
@ -98,7 +98,16 @@ rand = { version = "0.8.5", features = ["small_rng"] }
[features] [features]
all-tokenizations = [ all-tokenizations = [
"charabia/default", "charabia/chinese",
"charabia/hebrew",
"charabia/japanese",
"charabia/thai",
"charabia/korean",
"charabia/greek",
"charabia/khmer",
"charabia/vietnamese",
"charabia/swedish-recomposition",
"charabia/german-segmentation",
] ]
# Use POSIX semaphores instead of SysV semaphores in LMDB # Use POSIX semaphores instead of SysV semaphores in LMDB
@ -137,8 +146,5 @@ german = ["charabia/german-segmentation"]
# force swedish character recomposition # force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"] swedish-recomposition = ["charabia/swedish-recomposition"]
# allow turkish specialized tokenization
turkish = ["charabia/turkish"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306> # allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"] cuda = ["candle-core/cuda"]

View File

@ -298,7 +298,6 @@ impl From<arroy::Error> for Error {
arroy::Error::InvalidVecDimension { expected, received } => { arroy::Error::InvalidVecDimension { expected, received } => {
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received }) Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
} }
arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
arroy::Error::DatabaseFull arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend | arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. } | arroy::Error::UnmatchingDistance { .. }

View File

@ -1610,6 +1610,24 @@ impl Index {
.unwrap_or_default()) .unwrap_or_default())
} }
pub fn arroy_readers<'a>(
&'a self,
rtxn: &'a RoTxn<'a>,
embedder_id: u8,
quantized: bool,
) -> impl Iterator<Item = Result<ArroyWrapper>> + 'a {
crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| {
let reader = ArroyWrapper::new(self.vector_arroy, k, quantized);
// Here we don't care about the dimensions, but we want to know if we can read
// in the database or if its metadata are missing because there is no document with that many vectors.
match reader.dimensions(rtxn) {
Ok(_) => Some(Ok(reader)),
Err(arroy::Error::MissingMetadata(_)) => None,
Err(e) => Some(Err(e.into())),
}
})
}
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
} }
@ -1631,9 +1649,14 @@ impl Index {
let embedding_configs = self.embedding_configs(rtxn)?; let embedding_configs = self.embedding_configs(rtxn)?;
for config in embedding_configs { for config in embedding_configs {
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
let reader = let embeddings = self
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); .arroy_readers(rtxn, embedder_id, config.config.quantized())
let embeddings = reader.item_vectors(rtxn, docid)?; .map_while(|reader| {
reader
.and_then(|r| r.item_vector(rtxn, docid).map_err(|e| e.into()))
.transpose()
})
.collect::<Result<Vec<_>>>()?;
res.insert(config.name.to_owned(), embeddings); res.insert(config.name.to_owned(), embeddings);
} }
Ok(res) Ok(res)

View File

@ -1,139 +0,0 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};
struct MatchIntervalWithScore {
interval: [usize; 2],
score: [i16; 3],
}
// count score for phrases
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be ordered, so +1 for each space between words
*order_score += words_in_phrase_minus_one;
// distance will always be 1, so -1 for each space between words
*distance_score -= words_in_phrase_minus_one;
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
let m_last_word_pos = match m.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
// in case last match is a phrase, count score for its words
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
}
ids.extend(m.ids.iter());
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniq_score, distance_score, order_score]
}
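The returned `[i16; 3]` is later compared with `>` directly. A small standalone sketch (values made up) of why that works: fixed-size arrays compare lexicographically, so the unique-match count dominates, then the distance score, then the order score:

fn main() {
    let a: [i16; 3] = [3, -2, 0]; // three unique matches, mediocre distance
    let b: [i16; 3] = [2, 0, 5];  // two unique matches, better distance and order
    assert!(a > b); // `a` wins because its first component is larger
    println!("{a:?} > {b:?}");
}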
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
if matches.is_empty() {
panic!("`matches` should not be empty at this point");
}
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval: Option<MatchIntervalWithScore> = None;
let mut save_best_interval = |interval_first, interval_last| {
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
let is_interval_score_better = &best_interval
.as_ref()
.map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
if *is_interval_score_better {
best_interval = Some(MatchIntervalWithScore {
interval: [interval_first, interval_last],
score: interval_score,
});
}
};
// we compute the matches interval if we have at least 2 matches.
// current interval positions.
let mut interval_first = 0;
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the interval grow beyond crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until the next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
// if index is 0 there is no last viable match
if index != 0 {
let interval_last = index - 1;
// keep interval if it's the best
save_best_interval(interval_first, interval_last);
}
// advance start of the interval while interval is longer than crop_size.
loop {
interval_first += 1;
if interval_first == matches.len() {
interval_first -= 1;
break;
}
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
if interval_first_match_first_word_pos > next_match_last_word_pos
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
{
break;
}
}
}
}
// compute the last interval score and compare it to the best one.
let interval_last = matches.len() - 1;
// if the interval is just the last match on its own, we need to make sure it's
// not a phrase longer than the crop window
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
save_best_interval(interval_first, interval_last);
}
// if none of the matches fit the criteria above, default to the first one
best_interval.map_or(
[&matches[0], &matches[0]],
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
)
}

View File

@ -1,62 +0,0 @@
use super::matching_words::WordId;
#[derive(Clone, Debug)]
pub enum MatchPosition {
Word {
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
},
Phrase {
// position of the first and last word in the phrase in the whole text.
word_positions: [usize; 2],
// position of the first and last token in the phrase in the whole text.
token_positions: [usize; 2],
},
}
#[derive(Clone, Debug)]
pub struct Match {
pub char_count: usize,
// ids of the query words that matches.
pub ids: Vec<WordId>,
pub position: MatchPosition,
}
impl Match {
pub(super) fn get_first_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
}
}
pub(super) fn get_last_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
}
}
pub(super) fn get_first_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
}
}
pub(super) fn get_last_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
}
}
pub(super) fn get_word_count(&self) -> usize {
match self.position {
MatchPosition::Word { .. } => 1,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
}
}
}
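A brief usage sketch (hypothetical positions, not part of the changeset) showing how the two variants answer the position getters; it is written as a child test module because the getters are pub(super):

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn word_count_for_word_and_phrase() {
        let single = Match {
            char_count: 5,
            ids: vec![0],
            position: MatchPosition::Word { word_position: 3, token_position: 6 },
        };
        let phrase = Match {
            char_count: 9,
            ids: vec![1, 2],
            position: MatchPosition::Phrase { word_positions: [7, 9], token_positions: [14, 18] },
        };
        assert_eq!(single.get_word_count(), 1);
        assert_eq!(phrase.get_word_count(), 3); // 9 - 7 + 1
        assert_eq!(phrase.get_first_word_pos(), 7);
        assert_eq!(phrase.get_last_token_pos(), 18);
    }
}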

View File

@ -86,17 +86,14 @@ impl MatchingWords {
continue; continue;
}; };
let prefix_length = char_index + c.len_utf8(); let prefix_length = char_index + c.len_utf8();
let (char_count, byte_len) = token.original_lengths(prefix_length); let char_len = token.original_lengths(prefix_length).0;
let ids = &located_words.positions; let ids = &located_words.positions;
return Some(MatchType::Full { ids, char_count, byte_len }); return Some(MatchType::Full { char_len, ids });
// else we exact match the token. // else we exact match the token.
} else if token.lemma() == word { } else if token.lemma() == word {
let char_len = token.char_end - token.char_start;
let ids = &located_words.positions; let ids = &located_words.positions;
return Some(MatchType::Full { return Some(MatchType::Full { char_len, ids });
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
});
} }
} }
} }
@ -133,7 +130,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> {
word.map(|word| self.matching_words.word_interner.get(word).as_str()) word.map(|word| self.matching_words.word_interner.get(word).as_str())
}) })
.collect(); .collect();
let partial = PartialMatch { matching_words: words, ids }; let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
partial.match_token(self.token).or_else(|| self.next()) partial.match_token(self.token).or_else(|| self.next())
} }
@ -152,7 +149,7 @@ pub type WordId = u16;
/// In these cases we need to match consecutively several tokens to consider that the match is full. /// In these cases we need to match consecutively several tokens to consider that the match is full.
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum MatchType<'a> { pub enum MatchType<'a> {
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> }, Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>), Partial(PartialMatch<'a>),
} }
@ -161,6 +158,7 @@ pub enum MatchType<'a> {
pub struct PartialMatch<'a> { pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>, matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>, ids: &'a RangeInclusive<WordId>,
char_len: usize,
} }
impl<'a> PartialMatch<'a> { impl<'a> PartialMatch<'a> {
@ -178,24 +176,25 @@ impl<'a> PartialMatch<'a> {
None => token.is_stopword(), None => token.is_stopword(),
}; };
let char_len = token.char_end - token.char_start;
// if there are remaining words to match in the phrase and the current token is matching, // if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue. // return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 { if is_matching && matching_words.len() > 1 {
matching_words.remove(0); matching_words.remove(0);
Some(MatchType::Partial(Self { matching_words, ids })) Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
// if there is no remaining word to match in the phrase and the current token is matching, // if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match. // return a Full match.
} else if is_matching { } else if is_matching {
Some(MatchType::Full { Some(MatchType::Full { char_len, ids })
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
})
// if the current token doesn't match, return None to break the match sequence. // if the current token doesn't match, return None to break the match sequence.
} else { } else {
None None
} }
} }
pub fn char_len(&self) -> usize {
self.char_len
}
} }
impl fmt::Debug for MatchingWords { impl fmt::Debug for MatchingWords {
@ -277,7 +276,7 @@ pub(crate) mod tests {
..Default::default() ..Default::default()
}) })
.next(), .next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) }) Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
); );
assert_eq!( assert_eq!(
matching_words matching_words
@ -301,7 +300,7 @@ pub(crate) mod tests {
..Default::default() ..Default::default()
}) })
.next(), .next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
); );
assert_eq!( assert_eq!(
matching_words matching_words
@ -313,7 +312,7 @@ pub(crate) mod tests {
..Default::default() ..Default::default()
}) })
.next(), .next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
); );
assert_eq!( assert_eq!(
matching_words matching_words

View File

@ -1,19 +1,11 @@
mod best_match_interval; use std::borrow::Cow;
mod r#match;
mod matching_words;
mod simple_token_kind;
use charabia::{Language, SeparatorKind, Token, Tokenizer}; use charabia::{Language, SeparatorKind, Token, Tokenizer};
use either::Either;
pub use matching_words::MatchingWords; pub use matching_words::MatchingWords;
use matching_words::{MatchType, PartialMatch}; use matching_words::{MatchType, PartialMatch, WordId};
use r#match::{Match, MatchPosition};
use serde::Serialize; use serde::Serialize;
use simple_token_kind::SimpleTokenKind;
use std::{ pub mod matching_words;
borrow::Cow,
cmp::{max, min},
};
const DEFAULT_CROP_MARKER: &str = ""; const DEFAULT_CROP_MARKER: &str = "";
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>"; const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
@ -101,6 +93,17 @@ impl FormatOptions {
} }
} }
#[derive(Clone, Debug)]
pub struct Match {
match_len: usize,
// ids of the query words that matches.
ids: Vec<WordId>,
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
}
#[derive(Serialize, Debug, Clone, PartialEq, Eq)] #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds { pub struct MatchBounds {
pub start: usize, pub start: usize,
@ -127,27 +130,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
/// compute_partial_match peeks into the next words to validate whether the match is complete. /// compute_partial_match peeks into the next words to validate whether the match is complete.
fn compute_partial_match<'a>( fn compute_partial_match<'a>(
mut partial: PartialMatch<'a>, mut partial: PartialMatch<'a>,
first_token_position: usize, token_position: usize,
first_word_position: usize, word_position: usize,
first_word_char_start: &usize,
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>, words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
matches: &mut Vec<Match>, matches: &mut Vec<Match>,
) -> bool { ) -> bool {
let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
for (token_position, word_position, word) in words_positions { for (token_position, word_position, word) in words_positions {
partial = match partial.match_token(word) { partial = match partial.match_token(word) {
// token matches the partial match, but the match is not full, // token matches the partial match, but the match is not full,
// we temporarily save the current token then we try to match the next one. // we temporarily save the current token then we try to match the next one.
Some(MatchType::Partial(partial)) => partial, Some(MatchType::Partial(partial)) => {
potential_matches.push((token_position, word_position, partial.char_len()));
partial
}
// partial match is now full, we keep these matches and we advance positions // partial match is now full, we keep these matches and we advance positions
Some(MatchType::Full { ids, .. }) => { Some(MatchType::Full { char_len, ids }) => {
let ids: Vec<_> = ids.clone().collect();
// save previously matched tokens as matches.
let iter = potential_matches.into_iter().map(
|(token_position, word_position, match_len)| Match {
match_len,
ids: ids.clone(),
word_position,
token_position,
},
);
matches.extend(iter);
// save the token that closes the partial match as a match. // save the token that closes the partial match as a match.
matches.push(Match { matches.push(Match {
char_count: word.char_end - *first_word_char_start, match_len: char_len,
ids: ids.clone().collect(), ids,
position: MatchPosition::Phrase { word_position,
word_positions: [first_word_position, word_position], token_position,
token_positions: [first_token_position, token_position],
},
}); });
// the match is complete, we return true. // the match is complete, we return true.
@ -185,12 +202,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
match match_type { match match_type {
// we match, we save the current token as a match, // we match, we save the current token as a match,
// then we continue with the rest of the tokens. // then we continue with the rest of the tokens.
MatchType::Full { ids, char_count, .. } => { MatchType::Full { char_len, ids } => {
let ids: Vec<_> = ids.clone().collect(); let ids: Vec<_> = ids.clone().collect();
matches.push(Match { matches.push(Match {
char_count, match_len: char_len,
ids, ids,
position: MatchPosition::Word { word_position, token_position }, word_position,
token_position,
}); });
break; break;
} }
@ -203,7 +221,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
partial, partial,
token_position, token_position,
word_position, word_position,
&word.char_start,
&mut wp, &mut wp,
&mut matches, &mut matches,
) { ) {
@ -226,99 +243,56 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
Some((tokens, matches)) => matches Some((tokens, matches)) => matches
.iter() .iter()
.map(|m| MatchBounds { .map(|m| MatchBounds {
start: tokens[m.get_first_token_pos()].byte_start, start: tokens[m.token_position].byte_start,
// TODO: Why is this in chars, while start is in bytes? length: m.match_len,
length: m.char_count,
}) })
.collect(), .collect(),
} }
} }
/// Returns the bounds in byte index of the crop window. /// Returns the bounds in byte index of the crop window.
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] { fn crop_bounds(
let ( &self,
mut remaining_words, tokens: &[Token<'_>],
is_iterating_forward, matches: &[Match],
before_tokens_starting_index, crop_size: usize,
after_tokens_starting_index, ) -> (usize, usize) {
) = if !matches.is_empty() { // if there is no match, we start from the beginning of the string by default.
let [matches_first, matches_last] = let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
best_match_interval::find_best_match_interval(matches, crop_size); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
let matches_size = // matches needs to be counted in the crop len.
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
let is_crop_size_gte_match_size = crop_size >= matches_size;
let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
let remaining_words = if is_crop_size_gte_match_size {
crop_size - matches_size
} else {
// in case matches size is greater than crop size, which implies there's only one match,
// we count words backwards, because we have to remove words, as they're extra words outside of
// crop window
matches_size - crop_size
};
let after_tokens_starting_index = if matches_size == 0 {
0
} else {
let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
if last_match_last_token_position_plus_one < tokens.len() {
last_match_last_token_position_plus_one
} else {
// we have matched the end of possible tokens, there's nothing to advance
tokens.len() - 1
}
};
(
remaining_words,
is_iterating_forward,
if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
after_tokens_starting_index,
)
} else {
(crop_size, true, 0, 0)
};
// create the initial state of the crop window: 2 iterators starting from the matches positions, // create the initial state of the crop window: 2 iterators starting from the matches positions,
// a reverse iterator starting from the first match token position and going towards the beginning of the text, // a reverse iterator starting from the first match token position and going towards the beginning of the text,
let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable(); let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
// an iterator ... // an iterator starting from the last match token position and going towards the end of the text.
let mut after_tokens = if is_iterating_forward { let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
// ... starting from the last match token position and going towards the end of the text.
Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
} else {
// ... starting from the last match token position and going towards the start of the text.
Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
};
// grows the crop window peeking in both directions // grows the crop window peeking in both directions
// until the window contains the good number of words: // until the window contains the good number of words:
while remaining_words > 0 { while remaining_words > 0 {
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); let before_token = before_tokens.peek().map(|t| t.separator_kind());
let after_token_kind = let after_token = after_tokens.peek().map(|t| t.separator_kind());
after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
match (before_token_kind, after_token_kind) { match (before_token, after_token) {
// we can expand both sides. // we can expand both sides.
(Some(before_token_kind), Some(after_token_kind)) => { (Some(before_token), Some(after_token)) => {
match (before_token_kind, after_token_kind) { match (before_token, after_token) {
// if they are both separators and are the same kind then advance both, // if they are both separators and are the same kind then advance both,
// or expand on the soft separator side. // or expand on the soft separator side.
( (Some(before_token_kind), Some(after_token_kind)) => {
SimpleTokenKind::Separator(before_token_separator_kind), if before_token_kind == after_token_kind {
SimpleTokenKind::Separator(after_token_separator_kind),
) => {
if before_token_separator_kind == after_token_separator_kind {
before_tokens.next(); before_tokens.next();
// this avoids having an ending separator before the crop marker. // this avoids having an ending separator before the crop marker.
if remaining_words > 1 { if remaining_words > 1 {
after_tokens.next(); after_tokens.next();
} }
} else if matches!(before_token_separator_kind, SeparatorKind::Hard) { } else if before_token_kind == SeparatorKind::Hard {
after_tokens.next(); after_tokens.next();
} else { } else {
before_tokens.next(); before_tokens.next();
@ -326,17 +300,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
} }
// if one of the tokens is a word, we expand on the side of the word. // if one of the tokens is a word, we expand on the side of the word.
// left is a word, advance left. // left is a word, advance left.
(SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { (None, Some(_)) => {
before_tokens.next(); before_tokens.next();
remaining_words -= 1; remaining_words -= 1;
} }
// right is a word, advance right. // right is a word, advance right.
(SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { (Some(_), None) => {
after_tokens.next(); after_tokens.next();
remaining_words -= 1; remaining_words -= 1;
} }
// both are words, advance left then right if remaining_words > 0. // both are words, advance left then right if remaining_words > 0.
(SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { (None, None) => {
before_tokens.next(); before_tokens.next();
remaining_words -= 1; remaining_words -= 1;
@ -348,16 +322,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
} }
} }
// the end of the text is reached, advance left. // the end of the text is reached, advance left.
(Some(before_token_kind), None) => { (Some(before_token), None) => {
before_tokens.next(); before_tokens.next();
if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { if before_token.is_none() {
remaining_words -= 1; remaining_words -= 1;
} }
} }
// the start of the text is reached, advance right. // the start of the text is reached, advance right.
(None, Some(after_token_kind)) => { (None, Some(after_token)) => {
after_tokens.next(); after_tokens.next();
if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { if after_token.is_none() {
remaining_words -= 1; remaining_words -= 1;
} }
} }
@ -370,7 +344,86 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
[crop_byte_start, crop_byte_end] (crop_byte_start, crop_byte_end)
}
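Stripped of the separator handling, the growth loop above reduces to the following toy (not part of the changeset; plain words instead of charabia tokens, made-up match positions):

fn crop_window(words: &[&str], first_match: usize, last_match: usize, crop_size: usize) -> (usize, usize) {
    let mut start = first_match;
    let mut end = last_match + 1; // exclusive
    let mut remaining = crop_size.saturating_sub(end - start);
    while remaining > 0 {
        match (start > 0, end < words.len()) {
            // room on both sides: grow symmetrically around the matches
            (true, true) => { start -= 1; end += 1; remaining = remaining.saturating_sub(2); }
            // one side is exhausted: keep growing on the other
            (true, false) => { start -= 1; remaining -= 1; }
            (false, true) => { end += 1; remaining -= 1; }
            // the whole text already fits
            (false, false) => break,
        }
    }
    (start, end)
}

fn main() {
    let words: Vec<&str> =
        "the power to split the world between those who embraced progress".split(' ').collect();
    // pretend the phrase "the world" matched at word positions 4..=5, with a crop size of 6
    let (start, end) = crop_window(&words, 4, 5, 6);
    assert_eq!(words[start..end].join(" "), "to split the world between those");
}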
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
// compute distance between matches
distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
}
ids.extend(m.ids.iter());
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
(uniq_score, distance_score, order_score)
}
/// Returns the matches interval where the score computed by match_interval_score is the best.
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
// we compute the matches interval if we have at least 2 matches.
if matches.len() > 1 {
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval = (0, 0);
let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
// current interval positions.
let mut interval_first = 0;
let mut interval_last = 0;
for (index, next_match) in matches.iter().enumerate().skip(1) {
// if next match would make the interval grow more than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
if next_match.word_position - matches[interval_first].word_position >= crop_size {
let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]);
// keep interval if it's the best
if interval_score > best_interval_score {
best_interval = (interval_first, interval_last);
best_interval_score = interval_score;
}
// advance start of the interval while interval is longer than crop_size.
while next_match.word_position - matches[interval_first].word_position
>= crop_size
{
interval_first += 1;
}
}
interval_last = index;
}
// compute the last interval score and compare it to the best one.
let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]);
if interval_score > best_interval_score {
best_interval = (interval_first, interval_last);
}
&matches[best_interval.0..=best_interval.1]
} else {
matches
}
} }
// Returns the formatted version of the original text. // Returns the formatted version of the original text.
@ -381,87 +434,69 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
} else { } else {
match &self.matches { match &self.matches {
Some((tokens, matches)) => { Some((tokens, matches)) => {
// If the text has to be cropped, crop around the best interval. // If the text has to be cropped,
let [crop_byte_start, crop_byte_end] = match format_options.crop { // crop around the best interval.
let (byte_start, byte_end) = match format_options.crop {
Some(crop_size) if crop_size > 0 => { Some(crop_size) if crop_size > 0 => {
let matches = self.find_best_match_interval(matches, crop_size);
self.crop_bounds(tokens, matches, crop_size) self.crop_bounds(tokens, matches, crop_size)
} }
_ => [0, self.text.len()], _ => (0, self.text.len()),
}; };
let mut formatted = Vec::new(); let mut formatted = Vec::new();
// push crop marker if it's not the start of the text. // push crop marker if it's not the start of the text.
if crop_byte_start > 0 && !self.crop_marker.is_empty() { if byte_start > 0 && !self.crop_marker.is_empty() {
formatted.push(self.crop_marker); formatted.push(self.crop_marker);
} }
let mut byte_index = crop_byte_start; let mut byte_index = byte_start;
if format_options.highlight { if format_options.highlight {
// insert highlight markers around matches. // insert highlight markers around matches.
for m in matches { for m in matches {
let [m_byte_start, m_byte_end] = match m.position { let token = &tokens[m.token_position];
MatchPosition::Word { token_position, .. } => {
let token = &tokens[token_position];
[&token.byte_start, &token.byte_end]
}
MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
[&tokens[ftp].byte_start, &tokens[ltp].byte_end]
}
};
// skip matches out of the crop window // skip matches out of the crop window.
if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end { if token.byte_start < byte_start || token.byte_end > byte_end {
continue; continue;
} }
// adjust start and end to the crop window size if byte_index < token.byte_start {
let [m_byte_start, m_byte_end] = [ formatted.push(&self.text[byte_index..token.byte_start]);
max(m_byte_start, &crop_byte_start),
min(m_byte_end, &crop_byte_end),
];
// push text that is positioned before our matches
if byte_index < *m_byte_start {
formatted.push(&self.text[byte_index..*m_byte_start]);
} }
formatted.push(self.highlight_prefix); let highlight_byte_index = self.text[token.byte_start..]
// TODO: This is additional work; charabia::token::Token byte_len
// should already give us the original byte length, however it doesn't work as
// it's supposed to, so investigate why
let highlight_byte_index = self.text[*m_byte_start..]
.char_indices() .char_indices()
.nth(m.char_count) .enumerate()
.map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end)); .find(|(i, _)| *i == m.match_len)
formatted.push(&self.text[*m_byte_start..highlight_byte_index]); .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
formatted.push(self.highlight_prefix);
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
formatted.push(self.highlight_suffix); formatted.push(self.highlight_suffix);
// if it's a prefix highlight, we put the end of the word after the highlight marker. // if it's a prefix highlight, we put the end of the word after the highlight marker.
if highlight_byte_index < *m_byte_end { if highlight_byte_index < token.byte_end {
formatted.push(&self.text[highlight_byte_index..*m_byte_end]); formatted.push(&self.text[highlight_byte_index..token.byte_end]);
} }
byte_index = *m_byte_end; byte_index = token.byte_end;
} }
} }
// push the rest of the text between last match and the end of crop. // push the rest of the text between last match and the end of crop.
if byte_index < crop_byte_end { if byte_index < byte_end {
formatted.push(&self.text[byte_index..crop_byte_end]); formatted.push(&self.text[byte_index..byte_end]);
} }
// push crop marker if it's not the end of the text. // push crop marker if it's not the end of the text.
if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() { if byte_end < self.text.len() && !self.crop_marker.is_empty() {
formatted.push(self.crop_marker); formatted.push(self.crop_marker);
} }
if formatted.len() == 1 { if formatted.len() == 1 {
// avoid concatenating if there is already 1 slice. // avoid concatenating if there is already 1 slice.
Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end]) Cow::Borrowed(&self.text[byte_start..byte_end])
} else { } else {
Cow::Owned(formatted.concat()) Cow::Owned(formatted.concat())
} }
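The char_indices conversion above exists because match lengths are counted in characters while the text is sliced in bytes, and the two diverge on non-ASCII input. A standalone sketch (not part of the changeset, hypothetical string):

fn main() {
    let text = "cafés ouverts";
    let char_count = 5; // suppose the match covers the first five characters, "cafés"
    // convert the char count into a byte index before slicing the text
    let byte_index = text.char_indices().nth(char_count).map_or(text.len(), |(i, _)| i);
    assert_eq!(byte_index, 6); // "é" is two bytes long, so 5 chars end at byte 6
    assert_eq!(&text[..byte_index], "cafés");
}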
@ -786,24 +821,22 @@ mod tests {
fn format_highlight_crop_phrase_query() { fn format_highlight_crop_phrase_query() {
//! testing: https://github.com/meilisearch/meilisearch/issues/3975 //! testing: https://github.com/meilisearch/meilisearch/issues/3975
let temp_index = TempIndex::new(); let temp_index = TempIndex::new();
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
temp_index temp_index
.add_documents(documents!([ .add_documents(documents!([
{ "id": 1, "text": text } { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
])) ]))
.unwrap(); .unwrap();
let rtxn = temp_index.read_txn().unwrap(); let rtxn = temp_index.read_txn().unwrap();
let format_options = FormatOptions { highlight: true, crop: Some(10) }; let format_options = FormatOptions { highlight: true, crop: Some(10) };
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
let mut matcher = builder.build(text, None); let mut matcher = builder.build(text, None);
// should return 10 words with a marker at the start as well as the end, and the highlighted matches. // should return 10 words with a marker at the start as well as the end, and the highlighted matches.
insta::assert_snapshot!( insta::assert_snapshot!(
matcher.format(format_options), matcher.format(format_options),
@"the power to split <em>the world</em> between those who embraced" @"had the power to split <em>the</em> <em>world</em> between those who…"
); );
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
@ -811,63 +844,7 @@ mod tests {
// should highlight "those" and the phrase "and those". // should highlight "those" and the phrase "and those".
insta::assert_snapshot!( insta::assert_snapshot!(
matcher.format(format_options), matcher.format(format_options),
@"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…" @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention had the power to split the world\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>The groundbreaking invention had the power to split the world</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention had the power to split the world between those\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>The groundbreaking invention had the power to split the world</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
// TODO: Should include exclamation mark without crop markers
@"…between those who <em>embraced progress and those who resisted change</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"groundbreaking invention\" \"split the world between\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"groundbreaking invention\" \"had the power to split the world between those\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"…<em>invention</em> <em>had the power to split the world between those</em>…"
); );
} }
@ -923,7 +900,7 @@ mod tests {
let mut matcher = builder.build(text, None); let mut matcher = builder.build(text, None);
insta::assert_snapshot!( insta::assert_snapshot!(
matcher.format(format_options), matcher.format(format_options),
@"_the_ _do or_ die can't be he do and or isn'_t he_" @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
); );
} }
} }

View File

@ -1,15 +0,0 @@
use charabia::{SeparatorKind, Token, TokenKind};
pub enum SimpleTokenKind {
Separator(SeparatorKind),
NotSeparator,
}
impl SimpleTokenKind {
pub fn new(token: &&Token<'_>) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NotSeparator,
}
}
}

View File

@ -1,10 +1,11 @@
use std::iter::FromIterator; use std::iter::FromIterator;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::score_details::{self, ScoreDetails}; use crate::score_details::{self, ScoreDetails};
use crate::vector::{ArroyWrapper, DistributionShift, Embedder}; use crate::vector::{DistributionShift, Embedder};
use crate::{DocumentId, Result, SearchContext, SearchLogger}; use crate::{DocumentId, Result, SearchContext, SearchLogger};
pub struct VectorSort<Q: RankingRuleQueryTrait> { pub struct VectorSort<Q: RankingRuleQueryTrait> {
@ -52,9 +53,14 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
vector_candidates: &RoaringBitmap, vector_candidates: &RoaringBitmap,
) -> Result<()> { ) -> Result<()> {
let target = &self.target; let target = &self.target;
let mut results = Vec::new();
let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized); for reader in ctx.index.arroy_readers(ctx.txn, self.embedder_index, self.quantized) {
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; let nns_by_vector =
reader?.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
results.extend(nns_by_vector.into_iter());
}
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
self.cached_sorted_docids = results.into_iter(); self.cached_sorted_docids = results.into_iter();
Ok(()) Ok(())
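The sort at the end of the accumulation is why OrderedFloat shows up here: f32 is not Ord, so a raw distance cannot be used as a sort key, but wrapping it in OrderedFloat can. A minimal standalone sketch of the merge-then-sort step (made-up document ids and distances):

use ordered_float::OrderedFloat;

fn main() {
    // results gathered from several arroy readers, in arbitrary order
    let mut results: Vec<(u32, f32)> = vec![(12, 0.40), (3, 0.12), (7, 0.25)];
    results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
    let ids: Vec<u32> = results.iter().map(|(id, _)| *id).collect();
    assert_eq!(ids, vec![3, 7, 12]); // closest documents first
}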

View File

@ -1,9 +1,10 @@
use std::sync::Arc; use std::sync::Arc;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::score_details::{self, ScoreDetails}; use crate::score_details::{self, ScoreDetails};
use crate::vector::{ArroyWrapper, Embedder}; use crate::vector::Embedder;
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult}; use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
pub struct Similar<'a> { pub struct Similar<'a> {
@ -70,13 +71,23 @@ impl<'a> Similar<'a> {
.get(self.rtxn, &self.embedder_name)? .get(self.rtxn, &self.embedder_name)?
.ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?; .ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?;
let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); let mut results = Vec::new();
let results = reader.nns_by_item(
for reader in self.index.arroy_readers(self.rtxn, embedder_index, self.quantized) {
let nns_by_item = reader?.nns_by_item(
self.rtxn, self.rtxn,
self.id, self.id,
self.limit + self.offset + 1, self.limit + self.offset + 1,
Some(&universe), Some(&universe),
)?; )?;
if let Some(mut nns_by_item) = nns_by_item {
results.append(&mut nns_by_item);
} else {
break;
}
}
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
let mut documents_ids = Vec::with_capacity(self.limit); let mut documents_ids = Vec::with_capacity(self.limit);
let mut document_scores = Vec::with_capacity(self.limit); let mut document_scores = Vec::with_capacity(self.limit);

View File

@ -689,8 +689,9 @@ where
key: None, key: None,
}, },
)?; )?;
let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap();
let reader = let reader =
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized);
let dim = reader.dimensions(self.wtxn)?; let dim = reader.dimensions(self.wtxn)?;
dimension.insert(name.to_string(), dim); dimension.insert(name.to_string(), dim);
} }
@ -699,7 +700,6 @@ where
for (embedder_name, dimension) in dimension { for (embedder_name, dimension) in dimension {
let wtxn = &mut *self.wtxn; let wtxn = &mut *self.wtxn;
let vector_arroy = self.index.vector_arroy; let vector_arroy = self.index.vector_arroy;
let cancel = &self.should_abort;
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
@ -713,8 +713,17 @@ where
let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized); let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
pool.install(|| { pool.install(|| {
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?; let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized);
if is_quantizing {
writer.quantize(wtxn, k, dimension)?;
}
if writer.need_build(wtxn, dimension)? {
writer.build(wtxn, &mut rng, dimension)?;
} else if writer.is_empty(wtxn, dimension)? {
break;
}
}
Result::Ok(()) Result::Ok(())
}) })
.map_err(InternalError::from)??; .map_err(InternalError::from)??;

View File

@ -990,24 +990,27 @@ impl<'a, 'i> Transform<'a, 'i> {
None None
}; };
let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff let readers: Result<BTreeMap<&str, (Vec<ArroyWrapper>, &RoaringBitmap)>> = settings_diff
.embedding_config_updates .embedding_config_updates
.iter() .iter()
.filter_map(|(name, action)| { .filter_map(|(name, action)| {
if let Some(WriteBackToDocuments { embedder_id, user_provided }) = if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
action.write_back() action.write_back()
{ {
let reader = ArroyWrapper::new( let readers: Result<Vec<_>> = self
self.index.vector_arroy, .index
*embedder_id, .arroy_readers(wtxn, *embedder_id, action.was_quantized)
action.was_quantized, .collect();
); match readers {
Some((name.as_str(), (reader, user_provided))) Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
Err(error) => Some(Err(error)),
}
} else { } else {
None None
} }
}) })
.collect(); .collect();
let readers = readers?;
let old_vectors_fid = settings_diff let old_vectors_fid = settings_diff
.old .old
@ -1045,24 +1048,34 @@ impl<'a, 'i> Transform<'a, 'i> {
arroy::Error, arroy::Error,
> = readers > = readers
.iter() .iter()
.filter_map(|(name, (reader, user_provided))| { .filter_map(|(name, (readers, user_provided))| {
if !user_provided.contains(docid) { if !user_provided.contains(docid) {
return None; return None;
} }
match reader.item_vectors(wtxn, docid) { let mut vectors = Vec::new();
Ok(vectors) if vectors.is_empty() => None, for reader in readers {
Ok(vectors) => Some(Ok(( let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
break;
};
match vector {
Ok(vector) => vectors.push(vector),
Err(error) => return Some(Err(error)),
}
}
if vectors.is_empty() {
return None;
}
Some(Ok((
name.to_string(), name.to_string(),
serde_json::to_value(ExplicitVectors { serde_json::to_value(ExplicitVectors {
embeddings: Some( embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
VectorOrArrayOfVectors::from_array_of_vectors(vectors), vectors,
), )),
regenerate: false, regenerate: false,
}) })
.unwrap(), .unwrap(),
))), )))
Err(e) => Some(Err(e)),
}
}) })
.collect(); .collect();
@ -1091,10 +1104,12 @@ impl<'a, 'i> Transform<'a, 'i> {
} }
// delete all vectors from the embedders that need removal // delete all vectors from the embedders that need removal
for (_, (reader, _)) in readers { for (_, (readers, _)) in readers {
for reader in readers {
let dimensions = reader.dimensions(wtxn)?; let dimensions = reader.dimensions(wtxn)?;
reader.clear(wtxn, dimensions)?; reader.clear(wtxn, dimensions)?;
} }
}
let grenad_params = GrenadParameters { let grenad_params = GrenadParameters {
chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_type: self.indexer_settings.chunk_compression_type,

View File

@ -673,14 +673,22 @@ pub(crate) fn write_typed_chunk_into_index(
.get(&embedder_name) .get(&embedder_name)
.map_or(false, |conf| conf.2); .map_or(false, |conf| conf.2);
// FIXME: allow customizing distance // FIXME: allow customizing distance
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
.map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized))
.collect();
// remove vectors for the docids we want removed // remove vectors for the docids we want removed
let merger = remove_vectors_builder.build(); let merger = remove_vectors_builder.build();
let mut iter = merger.into_stream_merger_iter()?; let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, _)) = iter.next()? { while let Some((key, _)) = iter.next()? {
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
writer.del_items(wtxn, expected_dimension, docid)?;
for writer in &writers {
// Uses invariant: vectors are packed in the first writers.
if !writer.del_item(wtxn, expected_dimension, docid)? {
break;
}
}
} }
// add generated embeddings // add generated embeddings
@ -708,7 +716,9 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings.embedding_count(), embeddings.embedding_count(),
))); )));
} }
writer.add_items(wtxn, docid, &embeddings)?; for (embedding, writer) in embeddings.iter().zip(&writers) {
writer.add_item(wtxn, expected_dimension, docid, embedding)?;
}
} }
// perform the manual diff // perform the manual diff
@ -723,14 +733,51 @@ pub(crate) fn write_typed_chunk_into_index(
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
let vector: Vec<f32> = pod_collect_to_vec(value); let vector: Vec<f32> = pod_collect_to_vec(value);
writer.del_item(wtxn, docid, &vector)?; let mut deleted_index = None;
for (index, writer) in writers.iter().enumerate() {
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
// uses invariant: vectors are packed in the first writers.
break;
};
if candidate == vector {
writer.del_item(wtxn, expected_dimension, docid)?;
deleted_index = Some(index);
}
}
// 🥲 enforce invariant: vectors are packed in the first writers.
if let Some(deleted_index) = deleted_index {
let mut last_index_with_a_vector = None;
for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
break;
};
last_index_with_a_vector = Some((index, candidate));
}
if let Some((last_index, vector)) = last_index_with_a_vector {
// unwrap: computed the index from the list of writers
let writer = writers.get(last_index).unwrap();
writer.del_item(wtxn, expected_dimension, docid)?;
writers.get(deleted_index).unwrap().add_item(
wtxn,
expected_dimension,
docid,
&vector,
)?;
}
}
} }
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
let vector = pod_collect_to_vec(value); let vector = pod_collect_to_vec(value);
// overflow was detected during vector extraction. // overflow was detected during vector extraction.
writer.add_item(wtxn, docid, &vector)?; for writer in &writers {
if !writer.contains_item(wtxn, expected_dimension, docid)? {
writer.add_item(wtxn, expected_dimension, docid, &vector)?;
break;
}
}
} }
} }
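Both the deletion loop and the re-pack step above maintain the invariant called out in the comments: a document's vectors always occupy a prefix of the writer slots. A toy model of that invariant (not part of the changeset; plain Vec<Option<Vec<f32>>> slots and a hypothetical del_and_repack helper instead of arroy writers):

fn del_and_repack(slots: &mut [Option<Vec<f32>>], vector: &[f32]) -> bool {
    // find the slot holding this vector, stopping at the first hole
    let mut deleted_index = None;
    for index in 0..slots.len() {
        let Some(candidate) = slots[index].as_deref() else {
            // invariant: vectors are packed in the first slots
            break;
        };
        if candidate == vector {
            slots[index] = None;
            deleted_index = Some(index);
            break;
        }
    }
    // re-pack: move the last stored vector into the hole that was just created
    if let Some(hole) = deleted_index {
        if let Some(last) = slots.iter().rposition(|slot| slot.is_some()) {
            if last > hole {
                let moved = slots[last].take();
                slots[hole] = moved;
            }
        }
    }
    deleted_index.is_some()
}

fn main() {
    let mut slots: Vec<Option<Vec<f32>>> =
        vec![Some(vec![1.0]), Some(vec![2.0]), Some(vec![3.0]), None];
    assert!(del_and_repack(&mut slots, &[2.0]));
    // the hole left at slot 1 was filled with the vector that used to live in slot 2
    assert_eq!(slots, vec![Some(vec![1.0]), Some(vec![3.0]), None, None]);
}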

View File

@ -1,7 +1,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use arroy::distances::{BinaryQuantizedCosine, Cosine}; use arroy::distances::{Angular, BinaryQuantizedAngular};
use arroy::ItemId; use arroy::ItemId;
use deserr::{DeserializeError, Deserr}; use deserr::{DeserializeError, Deserr};
use heed::{RoTxn, RwTxn, Unspecified}; use heed::{RoTxn, RwTxn, Unspecified};
@ -32,243 +32,105 @@ pub const REQUEST_PARALLELISM: usize = 40;
pub struct ArroyWrapper { pub struct ArroyWrapper {
quantized: bool, quantized: bool,
embedder_index: u8, index: u16,
database: arroy::Database<Unspecified>, database: arroy::Database<Unspecified>,
} }
impl ArroyWrapper { impl ArroyWrapper {
pub fn new( pub fn new(database: arroy::Database<Unspecified>, index: u16, quantized: bool) -> Self {
database: arroy::Database<Unspecified>, Self { database, index, quantized }
embedder_index: u8,
quantized: bool,
) -> Self {
Self { database, embedder_index, quantized }
} }
pub fn embedder_index(&self) -> u8 { pub fn index(&self) -> u16 {
self.embedder_index self.index
}
fn readers<'a, D: arroy::Distance>(
&'a self,
rtxn: &'a RoTxn<'a>,
db: arroy::Database<D>,
) -> impl Iterator<Item = Result<arroy::Reader<D>, arroy::Error>> + 'a {
arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| {
match arroy::Reader::open(rtxn, index, db) {
Ok(reader) => match reader.is_empty(rtxn) {
Ok(false) => Some(Ok(reader)),
Ok(true) => None,
Err(e) => Some(Err(e)),
},
Err(arroy::Error::MissingMetadata(_)) => None,
Err(e) => Some(Err(e)),
}
})
} }
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> { pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap();
if self.quantized { if self.quantized {
Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions()) Ok(arroy::Reader::open(rtxn, self.index, self.quantized_db())?.dimensions())
} else { } else {
Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions()) Ok(arroy::Reader::open(rtxn, self.index, self.angular_db())?.dimensions())
} }
} }
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>( pub fn quantize(
&mut self, &mut self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
index: u16,
dimension: usize,
) -> Result<(), arroy::Error> {
if !self.quantized {
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
writer.prepare_changing_distance::<BinaryQuantizedAngular>(wtxn)?;
self.quantized = true;
}
Ok(())
}
pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
if self.quantized {
arroy::Writer::new(self.quantized_db(), self.index, dimension).need_build(rtxn)
} else {
arroy::Writer::new(self.angular_db(), self.index, dimension).need_build(rtxn)
}
}
pub fn build<R: rand::Rng + rand::SeedableRng>(
&self,
wtxn: &mut RwTxn,
rng: &mut R, rng: &mut R,
dimension: usize, dimension: usize,
quantizing: bool,
cancel: &(impl Fn() -> bool + Sync + Send),
) -> Result<(), arroy::Error> { ) -> Result<(), arroy::Error> {
for index in arroy_db_range_for_embedder(self.embedder_index) {
if self.quantized { if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension); arroy::Writer::new(self.quantized_db(), self.index, dimension).build(wtxn, rng, None)
if writer.need_build(wtxn)? {
writer.builder(rng).build(wtxn)?
} else if writer.is_empty(wtxn)? {
break;
}
} else { } else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension); arroy::Writer::new(self.angular_db(), self.index, dimension).build(wtxn, rng, None)
// If we are quantizing the databases, we can't know from meilisearch
// if the db was empty but still contained the wrong metadata, thus we need
// to quantize everything and can't stop early. Since this operation can
// only happen once in the life of an embedder, it's not very performance
// sensitive.
if quantizing && !self.quantized {
let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
writer.builder(rng).cancel(cancel).build(wtxn)?;
} else if writer.need_build(wtxn)? {
writer.builder(rng).cancel(cancel).build(wtxn)?;
} else if writer.is_empty(wtxn)? {
break;
} }
} }
}
Ok(())
}
/// Overwrite all the embeddings associated with the index and item ID.
/// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings.
/// You should call `del_items` on the `item_id` before calling this method.
/// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored.
pub fn add_items(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
embeddings: &Embeddings<f32>,
) -> Result<(), arroy::Error> {
let dimension = embeddings.dimension();
for (index, vector) in
arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter())
{
if self.quantized {
arroy::Writer::new(self.quantized_db(), index, dimension)
.add_item(wtxn, item_id, vector)?
} else {
arroy::Writer::new(self.angular_db(), index, dimension)
.add_item(wtxn, item_id, vector)?
}
}
Ok(())
}
/// Add one vector for this document in the first index where we can find an empty spot.
pub fn add_item( pub fn add_item(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
vector: &[f32],
) -> Result<(), arroy::Error> {
if self.quantized {
self._add_item(wtxn, self.quantized_db(), item_id, vector)
} else {
self._add_item(wtxn, self.angular_db(), item_id, vector)
}
}
fn _add_item<D: arroy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
item_id: arroy::ItemId,
vector: &[f32],
) -> Result<(), arroy::Error> {
let dimension = vector.len();
for index in arroy_db_range_for_embedder(self.embedder_index) {
let writer = arroy::Writer::new(db, index, dimension);
if !writer.contains_item(wtxn, item_id)? {
writer.add_item(wtxn, item_id, vector)?;
break;
}
}
Ok(())
}
/// Delete all embeddings from a specific `item_id`
pub fn del_items(
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
dimension: usize, dimension: usize,
item_id: arroy::ItemId, item_id: arroy::ItemId,
vector: &[f32],
) -> Result<(), arroy::Error> { ) -> Result<(), arroy::Error> {
for index in arroy_db_range_for_embedder(self.embedder_index) {
if self.quantized { if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension); arroy::Writer::new(self.quantized_db(), self.index, dimension)
if !writer.del_item(wtxn, item_id)? { .add_item(wtxn, item_id, vector)
break;
}
} else { } else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension); arroy::Writer::new(self.angular_db(), self.index, dimension)
if !writer.del_item(wtxn, item_id)? { .add_item(wtxn, item_id, vector)
break;
}
} }
} }
Ok(())
}
/// Delete one item.
pub fn del_item( pub fn del_item(
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
dimension: usize,
item_id: arroy::ItemId, item_id: arroy::ItemId,
vector: &[f32],
) -> Result<bool, arroy::Error> { ) -> Result<bool, arroy::Error> {
if self.quantized { if self.quantized {
self._del_item(wtxn, self.quantized_db(), item_id, vector) arroy::Writer::new(self.quantized_db(), self.index, dimension).del_item(wtxn, item_id)
} else { } else {
self._del_item(wtxn, self.angular_db(), item_id, vector) arroy::Writer::new(self.angular_db(), self.index, dimension).del_item(wtxn, item_id)
} }
} }
fn _del_item<D: arroy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
item_id: arroy::ItemId,
vector: &[f32],
) -> Result<bool, arroy::Error> {
let dimension = vector.len();
let mut deleted_index = None;
for index in arroy_db_range_for_embedder(self.embedder_index) {
let writer = arroy::Writer::new(db, index, dimension);
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
// uses invariant: vectors are packed in the first writers.
break;
};
if candidate == vector {
writer.del_item(wtxn, item_id)?;
deleted_index = Some(index);
}
}
// 🥲 enforce invariant: vectors are packed in the first writers.
if let Some(deleted_index) = deleted_index {
let mut last_index_with_a_vector = None;
for index in
arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize)
{
let writer = arroy::Writer::new(db, index, dimension);
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
break;
};
last_index_with_a_vector = Some((index, candidate));
}
if let Some((last_index, vector)) = last_index_with_a_vector {
let writer = arroy::Writer::new(db, last_index, dimension);
writer.del_item(wtxn, item_id)?;
let writer = arroy::Writer::new(db, deleted_index, dimension);
writer.add_item(wtxn, item_id, &vector)?;
}
}
Ok(deleted_index.is_some())
}
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
for index in arroy_db_range_for_embedder(self.embedder_index) {
if self.quantized { if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension); arroy::Writer::new(self.quantized_db(), self.index, dimension).clear(wtxn)
if writer.is_empty(wtxn)? {
break;
}
writer.clear(wtxn)?;
} else { } else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension); arroy::Writer::new(self.angular_db(), self.index, dimension).clear(wtxn)
if writer.is_empty(wtxn)? {
break;
}
writer.clear(wtxn)?;
} }
} }
Ok(())
pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
if self.quantized {
arroy::Writer::new(self.quantized_db(), self.index, dimension).is_empty(rtxn)
} else {
arroy::Writer::new(self.angular_db(), self.index, dimension).is_empty(rtxn)
}
} }
pub fn contains_item( pub fn contains_item(
@ -277,25 +139,11 @@ impl ArroyWrapper {
dimension: usize, dimension: usize,
item: arroy::ItemId, item: arroy::ItemId,
) -> Result<bool, arroy::Error> { ) -> Result<bool, arroy::Error> {
for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized {
let contains = if self.quantized { arroy::Writer::new(self.quantized_db(), self.index, dimension).contains_item(rtxn, item)
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
if writer.is_empty(rtxn)? {
break;
}
writer.contains_item(rtxn, item)?
} else { } else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension); arroy::Writer::new(self.angular_db(), self.index, dimension).contains_item(rtxn, item)
if writer.is_empty(rtxn)? {
break;
} }
writer.contains_item(rtxn, item)?
};
if contains {
return Ok(contains);
}
}
Ok(false)
} }
pub fn nns_by_item( pub fn nns_by_item(
@ -304,108 +152,45 @@ impl ArroyWrapper {
item: ItemId, item: ItemId,
limit: usize, limit: usize,
filter: Option<&RoaringBitmap>, filter: Option<&RoaringBitmap>,
) -> Result<Vec<(ItemId, f32)>, arroy::Error> { ) -> Result<Option<Vec<(ItemId, f32)>>, arroy::Error> {
if self.quantized { if self.quantized {
self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter) arroy::Reader::open(rtxn, self.index, self.quantized_db())?
.nns_by_item(rtxn, item, limit, None, None, filter)
} else { } else {
self._nns_by_item(rtxn, self.angular_db(), item, limit, filter) arroy::Reader::open(rtxn, self.index, self.angular_db())?
.nns_by_item(rtxn, item, limit, None, None, filter)
} }
} }
fn _nns_by_item<D: arroy::Distance>(
&self,
rtxn: &RoTxn,
db: arroy::Database<D>,
item: ItemId,
limit: usize,
filter: Option<&RoaringBitmap>,
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
let mut results = Vec::new();
for reader in self.readers(rtxn, db) {
let reader = reader?;
let mut searcher = reader.nns(limit);
if let Some(filter) = filter {
searcher.candidates(filter);
}
if let Some(mut ret) = searcher.by_item(rtxn, item)? {
results.append(&mut ret);
} else {
break;
}
}
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
Ok(results)
}
pub fn nns_by_vector( pub fn nns_by_vector(
&self, &self,
rtxn: &RoTxn, txn: &RoTxn,
vector: &[f32], item: &[f32],
limit: usize, limit: usize,
filter: Option<&RoaringBitmap>, filter: Option<&RoaringBitmap>,
) -> Result<Vec<(ItemId, f32)>, arroy::Error> { ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
if self.quantized { if self.quantized {
self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter) arroy::Reader::open(txn, self.index, self.quantized_db())?
.nns_by_vector(txn, item, limit, None, None, filter)
} else { } else {
self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter) arroy::Reader::open(txn, self.index, self.angular_db())?
.nns_by_vector(txn, item, limit, None, None, filter)
} }
} }
fn _nns_by_vector<D: arroy::Distance>( pub fn item_vector(&self, rtxn: &RoTxn, docid: u32) -> Result<Option<Vec<f32>>, arroy::Error> {
&self,
rtxn: &RoTxn,
db: arroy::Database<D>,
vector: &[f32],
limit: usize,
filter: Option<&RoaringBitmap>,
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
let mut results = Vec::new();
for reader in self.readers(rtxn, db) {
let reader = reader?;
let mut searcher = reader.nns(limit);
if let Some(filter) = filter {
searcher.candidates(filter);
}
results.append(&mut searcher.by_vector(rtxn, vector)?);
}
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
Ok(results)
}
pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, arroy::Error> {
let mut vectors = Vec::new();
if self.quantized { if self.quantized {
for reader in self.readers(rtxn, self.quantized_db()) { arroy::Reader::open(rtxn, self.index, self.quantized_db())?.item_vector(rtxn, docid)
if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
vectors.push(vec);
} else { } else {
break; arroy::Reader::open(rtxn, self.index, self.angular_db())?.item_vector(rtxn, docid)
} }
} }
} else {
for reader in self.readers(rtxn, self.angular_db()) {
if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
vectors.push(vec);
} else {
break;
}
}
}
Ok(vectors)
}
fn angular_db(&self) -> arroy::Database<Cosine> { fn angular_db(&self) -> arroy::Database<Angular> {
self.database.remap_data_type() self.database.remap_data_type()
} }
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> { fn quantized_db(&self) -> arroy::Database<BinaryQuantizedAngular> {
self.database.remap_data_type() self.database.remap_data_type()
} }
} }

View File

@ -417,8 +417,6 @@ impl EmbeddingSettings {
pub const DISTRIBUTION: &'static str = "distribution"; pub const DISTRIBUTION: &'static str = "distribution";
pub const BINARY_QUANTIZED: &'static str = "binaryQuantized";
pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] {
match field { match field {
Self::SOURCE => &[ Self::SOURCE => &[
@ -458,13 +456,6 @@ impl EmbeddingSettings {
EmbedderSource::Rest, EmbedderSource::Rest,
EmbedderSource::UserProvided, EmbedderSource::UserProvided,
], ],
Self::BINARY_QUANTIZED => &[
EmbedderSource::HuggingFace,
EmbedderSource::Ollama,
EmbedderSource::OpenAi,
EmbedderSource::Rest,
EmbedderSource::UserProvided,
],
_other => unreachable!("unknown field"), _other => unreachable!("unknown field"),
} }
} }
@ -479,7 +470,6 @@ impl EmbeddingSettings {
Self::DIMENSIONS, Self::DIMENSIONS,
Self::DISTRIBUTION, Self::DISTRIBUTION,
Self::URL, Self::URL,
Self::BINARY_QUANTIZED,
], ],
EmbedderSource::HuggingFace => &[ EmbedderSource::HuggingFace => &[
Self::SOURCE, Self::SOURCE,
@ -487,7 +477,6 @@ impl EmbeddingSettings {
Self::REVISION, Self::REVISION,
Self::DOCUMENT_TEMPLATE, Self::DOCUMENT_TEMPLATE,
Self::DISTRIBUTION, Self::DISTRIBUTION,
Self::BINARY_QUANTIZED,
], ],
EmbedderSource::Ollama => &[ EmbedderSource::Ollama => &[
Self::SOURCE, Self::SOURCE,
@ -497,11 +486,8 @@ impl EmbeddingSettings {
Self::API_KEY, Self::API_KEY,
Self::DIMENSIONS, Self::DIMENSIONS,
Self::DISTRIBUTION, Self::DISTRIBUTION,
Self::BINARY_QUANTIZED,
], ],
EmbedderSource::UserProvided => { EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION],
&[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION, Self::BINARY_QUANTIZED]
}
EmbedderSource::Rest => &[ EmbedderSource::Rest => &[
Self::SOURCE, Self::SOURCE,
Self::API_KEY, Self::API_KEY,
@ -512,7 +498,6 @@ impl EmbeddingSettings {
Self::RESPONSE, Self::RESPONSE,
Self::HEADERS, Self::HEADERS,
Self::DISTRIBUTION, Self::DISTRIBUTION,
Self::BINARY_QUANTIZED,
], ],
} }
} }

View File

@ -77,8 +77,7 @@
"q": "puppy cute comforting movie", "q": "puppy cute comforting movie",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 0.1, "semanticRatio": 0.1
"embedder": "default"
} }
} }
}, },
@ -92,8 +91,7 @@
"q": "puppy cute comforting movie", "q": "puppy cute comforting movie",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 0.5, "semanticRatio": 0.5
"embedder": "default"
} }
} }
}, },
@ -107,8 +105,7 @@
"q": "puppy cute comforting movie", "q": "puppy cute comforting movie",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 0.9, "semanticRatio": 0.9
"embedder": "default"
} }
} }
}, },
@ -122,8 +119,7 @@
"q": "puppy cute comforting movie", "q": "puppy cute comforting movie",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 1.0, "semanticRatio": 1.0
"embedder": "default"
} }
} }
}, },
@ -137,8 +133,7 @@
"q": "shrek", "q": "shrek",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 1.0, "semanticRatio": 1.0
"embedder": "default"
} }
} }
}, },
@ -152,8 +147,7 @@
"q": "shrek", "q": "shrek",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 0.5, "semanticRatio": 0.5
"embedder": "default"
} }
} }
}, },
@ -167,8 +161,7 @@
"q": "shrek", "q": "shrek",
"limit": 100, "limit": 100,
"hybrid": { "hybrid": {
"semanticRatio": 0.1, "semanticRatio": 0.1
"embedder": "default"
} }
} }
}, },