mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Merge branch 'main' into key-actions-to-bitflags
This commit is contained in:
commit
7e45125e7a
2
.github/workflows/benchmarks-manual.yml
vendored
2
.github/workflows/benchmarks-manual.yml
vendored
@ -43,7 +43,7 @@ jobs:
|
|||||||
# Run benchmarks
|
# Run benchmarks
|
||||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||||
run: |
|
run: |
|
||||||
cd benchmarks
|
cd crates/benchmarks
|
||||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||||
|
|
||||||
# Generate critcmp files
|
# Generate critcmp files
|
||||||
|
2
.github/workflows/benchmarks-pr.yml
vendored
2
.github/workflows/benchmarks-pr.yml
vendored
@ -88,7 +88,7 @@ jobs:
|
|||||||
# Run benchmarks
|
# Run benchmarks
|
||||||
- name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
- name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||||
run: |
|
run: |
|
||||||
cd benchmarks
|
cd crates/benchmarks
|
||||||
cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
|
cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||||
|
|
||||||
# Generate critcmp files
|
# Generate critcmp files
|
||||||
|
@ -41,7 +41,7 @@ jobs:
|
|||||||
# Run benchmarks
|
# Run benchmarks
|
||||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||||
run: |
|
run: |
|
||||||
cd benchmarks
|
cd crates/benchmarks
|
||||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||||
|
|
||||||
# Generate critcmp files
|
# Generate critcmp files
|
||||||
|
@ -40,7 +40,7 @@ jobs:
|
|||||||
# Run benchmarks
|
# Run benchmarks
|
||||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||||
run: |
|
run: |
|
||||||
cd benchmarks
|
cd crates/benchmarks
|
||||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||||
|
|
||||||
# Generate critcmp files
|
# Generate critcmp files
|
||||||
|
@ -40,7 +40,7 @@ jobs:
|
|||||||
# Run benchmarks
|
# Run benchmarks
|
||||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||||
run: |
|
run: |
|
||||||
cd benchmarks
|
cd crates/benchmarks
|
||||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||||
|
|
||||||
# Generate critcmp files
|
# Generate critcmp files
|
||||||
|
6
.github/workflows/publish-binaries.yml
vendored
6
.github/workflows/publish-binaries.yml
vendored
@ -65,9 +65,9 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
os: [macos-12, windows-2022]
|
os: [macos-13, windows-2022]
|
||||||
include:
|
include:
|
||||||
- os: macos-12
|
- os: macos-13
|
||||||
artifact_name: meilisearch
|
artifact_name: meilisearch
|
||||||
asset_name: meilisearch-macos-amd64
|
asset_name: meilisearch-macos-amd64
|
||||||
- os: windows-2022
|
- os: windows-2022
|
||||||
@ -90,7 +90,7 @@ jobs:
|
|||||||
|
|
||||||
publish-macos-apple-silicon:
|
publish-macos-apple-silicon:
|
||||||
name: Publish binary for macOS silicon
|
name: Publish binary for macOS silicon
|
||||||
runs-on: macos-12
|
runs-on: macos-13
|
||||||
needs: check-version
|
needs: check-version
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
|
2
.github/workflows/test-suite.yml
vendored
2
.github/workflows/test-suite.yml
vendored
@ -51,7 +51,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
os: [macos-12, windows-2022]
|
os: [macos-13, windows-2022]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
- name: Cache dependencies
|
- name: Cache dependencies
|
||||||
|
48
Cargo.lock
generated
48
Cargo.lock
generated
@ -386,8 +386,28 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arroy"
|
name = "arroy"
|
||||||
version = "0.4.0"
|
version = "0.5.0"
|
||||||
source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e"
|
||||||
|
dependencies = [
|
||||||
|
"bytemuck",
|
||||||
|
"byteorder",
|
||||||
|
"heed",
|
||||||
|
"log",
|
||||||
|
"memmap2",
|
||||||
|
"nohash",
|
||||||
|
"ordered-float",
|
||||||
|
"rand",
|
||||||
|
"rayon",
|
||||||
|
"roaring",
|
||||||
|
"tempfile",
|
||||||
|
"thiserror",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "arroy"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@ -706,9 +726,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bytemuck"
|
name = "bytemuck"
|
||||||
version = "1.16.1"
|
version = "1.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
|
checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck_derive",
|
"bytemuck_derive",
|
||||||
]
|
]
|
||||||
@ -2555,7 +2575,7 @@ name = "index-scheduler"
|
|||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"arroy",
|
"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"big_s",
|
"big_s",
|
||||||
"bincode",
|
"bincode",
|
||||||
"crossbeam",
|
"crossbeam",
|
||||||
@ -3414,6 +3434,7 @@ dependencies = [
|
|||||||
"meilisearch-types",
|
"meilisearch-types",
|
||||||
"mimalloc",
|
"mimalloc",
|
||||||
"mime",
|
"mime",
|
||||||
|
"mopa-maintained",
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
"obkv",
|
"obkv",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@ -3516,6 +3537,7 @@ name = "meilitool"
|
|||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)",
|
||||||
"clap",
|
"clap",
|
||||||
"dump",
|
"dump",
|
||||||
"file-store",
|
"file-store",
|
||||||
@ -3546,7 +3568,7 @@ dependencies = [
|
|||||||
name = "milli"
|
name = "milli"
|
||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arroy",
|
"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"big_s",
|
"big_s",
|
||||||
"bimap",
|
"bimap",
|
||||||
"bincode",
|
"bincode",
|
||||||
@ -3681,6 +3703,12 @@ dependencies = [
|
|||||||
"syn 2.0.60",
|
"syn 2.0.60",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mopa-maintained"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mutually_exclusive_features"
|
name = "mutually_exclusive_features"
|
||||||
version = "0.0.3"
|
version = "0.0.3"
|
||||||
@ -4582,9 +4610,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rhai"
|
name = "rhai"
|
||||||
version = "1.19.0"
|
version = "1.20.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4"
|
||||||
checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ahash 0.8.11",
|
"ahash 0.8.11",
|
||||||
"bitflags 2.6.0",
|
"bitflags 2.6.0",
|
||||||
@ -4601,8 +4628,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "rhai_codegen"
|
name = "rhai_codegen"
|
||||||
version = "2.2.0"
|
version = "2.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4"
|
||||||
checksum = "a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
status = [
|
status = [
|
||||||
'Tests on ubuntu-20.04',
|
'Tests on ubuntu-20.04',
|
||||||
'Tests on macos-12',
|
'Tests on macos-13',
|
||||||
'Tests on windows-2022',
|
'Tests on windows-2022',
|
||||||
'Run Clippy',
|
'Run Clippy',
|
||||||
'Run Rustfmt',
|
'Run Rustfmt',
|
||||||
|
@ -40,7 +40,7 @@ ureq = "2.10.0"
|
|||||||
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
arroy = "0.5.0"
|
||||||
big_s = "1.0.2"
|
big_s = "1.0.2"
|
||||||
crossbeam = "0.8.4"
|
crossbeam = "0.8.4"
|
||||||
insta = { version = "1.39.0", features = ["json", "redactions"] }
|
insta = { version = "1.39.0", features = ["json", "redactions"] }
|
||||||
|
@ -1263,7 +1263,7 @@ impl IndexScheduler {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?;
|
self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?;
|
||||||
|
|
||||||
tracing::info!("Batch failed {}", error);
|
tracing::error!("Batch failed {}", error);
|
||||||
|
|
||||||
self.update_task(&mut wtxn, &task)
|
self.update_task(&mut wtxn, &task)
|
||||||
.map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?;
|
.map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?;
|
||||||
|
@ -67,5 +67,8 @@ khmer = ["milli/khmer"]
|
|||||||
vietnamese = ["milli/vietnamese"]
|
vietnamese = ["milli/vietnamese"]
|
||||||
# force swedish character recomposition
|
# force swedish character recomposition
|
||||||
swedish-recomposition = ["milli/swedish-recomposition"]
|
swedish-recomposition = ["milli/swedish-recomposition"]
|
||||||
# force german character recomposition
|
# allow german tokenization
|
||||||
german = ["milli/german"]
|
german = ["milli/german"]
|
||||||
|
# allow turkish normalization
|
||||||
|
turkish = ["milli/turkish"]
|
||||||
|
|
||||||
|
@ -75,7 +75,7 @@ reqwest = { version = "0.12.5", features = [
|
|||||||
rustls = { version = "0.23.11", features = ["ring"], default-features = false }
|
rustls = { version = "0.23.11", features = ["ring"], default-features = false }
|
||||||
rustls-pki-types = { version = "1.7.0", features = ["alloc"] }
|
rustls-pki-types = { version = "1.7.0", features = ["alloc"] }
|
||||||
rustls-pemfile = "2.1.2"
|
rustls-pemfile = "2.1.2"
|
||||||
segment = { version = "0.2.4", optional = true }
|
segment = { version = "0.2.4" }
|
||||||
serde = { version = "1.0.204", features = ["derive"] }
|
serde = { version = "1.0.204", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
||||||
sha2 = "0.10.8"
|
sha2 = "0.10.8"
|
||||||
@ -104,6 +104,7 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
|
|||||||
tracing-actix-web = "0.7.11"
|
tracing-actix-web = "0.7.11"
|
||||||
build-info = { version = "1.7.0", path = "../build-info" }
|
build-info = { version = "1.7.0", path = "../build-info" }
|
||||||
roaring = "0.10.2"
|
roaring = "0.10.2"
|
||||||
|
mopa-maintained = "0.2.3"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
actix-rt = "2.10.0"
|
actix-rt = "2.10.0"
|
||||||
@ -131,8 +132,7 @@ tempfile = { version = "3.10.1", optional = true }
|
|||||||
zip = { version = "2.1.3", optional = true }
|
zip = { version = "2.1.3", optional = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
|
default = ["meilisearch-types/all-tokenizations", "mini-dashboard"]
|
||||||
analytics = ["segment"]
|
|
||||||
mini-dashboard = [
|
mini-dashboard = [
|
||||||
"static-files",
|
"static-files",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
@ -154,7 +154,8 @@ khmer = ["meilisearch-types/khmer"]
|
|||||||
vietnamese = ["meilisearch-types/vietnamese"]
|
vietnamese = ["meilisearch-types/vietnamese"]
|
||||||
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
|
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
|
||||||
german = ["meilisearch-types/german"]
|
german = ["meilisearch-types/german"]
|
||||||
|
turkish = ["meilisearch-types/turkish"]
|
||||||
|
|
||||||
[package.metadata.mini-dashboard]
|
[package.metadata.mini-dashboard]
|
||||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
|
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip"
|
||||||
sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"
|
sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e"
|
||||||
|
@ -1,44 +1,45 @@
|
|||||||
mod mock_analytics;
|
pub mod segment_analytics;
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
mod segment_analytics;
|
|
||||||
|
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
use actix_web::HttpRequest;
|
use actix_web::HttpRequest;
|
||||||
|
use index_scheduler::IndexScheduler;
|
||||||
|
use meilisearch_auth::AuthController;
|
||||||
use meilisearch_types::InstanceUid;
|
use meilisearch_types::InstanceUid;
|
||||||
pub use mock_analytics::MockAnalytics;
|
use mopa::mopafy;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use platform_dirs::AppDirs;
|
use platform_dirs::AppDirs;
|
||||||
use serde_json::Value;
|
|
||||||
|
|
||||||
use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery};
|
|
||||||
|
|
||||||
// if the analytics feature is disabled
|
|
||||||
// the `SegmentAnalytics` point to the mock instead of the real analytics
|
|
||||||
#[cfg(not(feature = "analytics"))]
|
|
||||||
pub type SegmentAnalytics = mock_analytics::MockAnalytics;
|
|
||||||
#[cfg(not(feature = "analytics"))]
|
|
||||||
pub type SearchAggregator = mock_analytics::SearchAggregator;
|
|
||||||
#[cfg(not(feature = "analytics"))]
|
|
||||||
pub type SimilarAggregator = mock_analytics::SimilarAggregator;
|
|
||||||
#[cfg(not(feature = "analytics"))]
|
|
||||||
pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator;
|
|
||||||
#[cfg(not(feature = "analytics"))]
|
|
||||||
pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator;
|
|
||||||
|
|
||||||
// if the feature analytics is enabled we use the real analytics
|
// if the feature analytics is enabled we use the real analytics
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
|
pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
pub type SearchAggregator = segment_analytics::SearchAggregator;
|
use crate::Opt;
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
pub type SimilarAggregator = segment_analytics::SimilarAggregator;
|
/// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name.
|
||||||
#[cfg(feature = "analytics")]
|
#[macro_export]
|
||||||
pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator;
|
macro_rules! empty_analytics {
|
||||||
#[cfg(feature = "analytics")]
|
($struct_name:ident, $event_name:literal) => {
|
||||||
pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator;
|
#[derive(Default)]
|
||||||
|
struct $struct_name {}
|
||||||
|
|
||||||
|
impl $crate::analytics::Aggregate for $struct_name {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
$event_name
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, _other: Box<Self>) -> Box<Self> {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::json!({})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
/// The Meilisearch config dir:
|
/// The Meilisearch config dir:
|
||||||
/// `~/.config/Meilisearch` on *NIX or *BSD.
|
/// `~/.config/Meilisearch` on *NIX or *BSD.
|
||||||
@ -78,60 +79,88 @@ pub enum DocumentFetchKind {
|
|||||||
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
|
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait Analytics: Sync + Send {
|
/// To send an event to segment, your event must be able to aggregate itself with another event of the same type.
|
||||||
fn instance_uid(&self) -> Option<&InstanceUid>;
|
pub trait Aggregate: 'static + mopa::Any + Send {
|
||||||
|
/// The name of the event that will be sent to segment.
|
||||||
|
fn event_name(&self) -> &'static str;
|
||||||
|
|
||||||
|
/// Will be called every time an event has been used twice before segment flushed its buffer.
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self>
|
||||||
|
where
|
||||||
|
Self: Sized;
|
||||||
|
|
||||||
|
/// Converts your structure to the final event that'll be sent to segment.
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value;
|
||||||
|
}
|
||||||
|
|
||||||
|
mopafy!(Aggregate);
|
||||||
|
|
||||||
|
/// Helper trait to define multiple aggregates with the same content but a different name.
|
||||||
|
/// Commonly used when you must aggregate a search with POST or with GET, for example.
|
||||||
|
pub trait AggregateMethod: 'static + Default + Send {
|
||||||
|
fn event_name() -> &'static str;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A macro used to quickly define multiple aggregate method with their name
|
||||||
|
/// Usage:
|
||||||
|
/// ```rust
|
||||||
|
/// use meilisearch::aggregate_methods;
|
||||||
|
///
|
||||||
|
/// aggregate_methods!(
|
||||||
|
/// SearchGET => "Documents Searched GET",
|
||||||
|
/// SearchPOST => "Documents Searched POST",
|
||||||
|
/// );
|
||||||
|
/// ```
|
||||||
|
#[macro_export]
|
||||||
|
macro_rules! aggregate_methods {
|
||||||
|
($method:ident => $event_name:literal) => {
|
||||||
|
#[derive(Default)]
|
||||||
|
pub struct $method {}
|
||||||
|
|
||||||
|
impl $crate::analytics::AggregateMethod for $method {
|
||||||
|
fn event_name() -> &'static str {
|
||||||
|
$event_name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
($($method:ident => $event_name:literal,)+) => {
|
||||||
|
$(
|
||||||
|
aggregate_methods!($method => $event_name);
|
||||||
|
)+
|
||||||
|
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct Analytics {
|
||||||
|
segment: Option<Arc<SegmentAnalytics>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Analytics {
|
||||||
|
pub async fn new(
|
||||||
|
opt: &Opt,
|
||||||
|
index_scheduler: Arc<IndexScheduler>,
|
||||||
|
auth_controller: Arc<AuthController>,
|
||||||
|
) -> Self {
|
||||||
|
if opt.no_analytics {
|
||||||
|
Self { segment: None }
|
||||||
|
} else {
|
||||||
|
Self { segment: SegmentAnalytics::new(opt, index_scheduler, auth_controller).await }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn no_analytics() -> Self {
|
||||||
|
Self { segment: None }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn instance_uid(&self) -> Option<&InstanceUid> {
|
||||||
|
self.segment.as_ref().map(|segment| segment.instance_uid.as_ref())
|
||||||
|
}
|
||||||
|
|
||||||
/// The method used to publish most analytics that do not need to be batched every hours
|
/// The method used to publish most analytics that do not need to be batched every hours
|
||||||
fn publish(&self, event_name: String, send: Value, request: Option<&HttpRequest>);
|
pub fn publish<T: Aggregate>(&self, event: T, request: &HttpRequest) {
|
||||||
|
if let Some(ref segment) = self.segment {
|
||||||
/// This method should be called to aggregate a get search
|
let _ = segment.sender.try_send(segment_analytics::Message::new(event, request));
|
||||||
fn get_search(&self, aggregate: SearchAggregator);
|
}
|
||||||
|
}
|
||||||
/// This method should be called to aggregate a post search
|
|
||||||
fn post_search(&self, aggregate: SearchAggregator);
|
|
||||||
|
|
||||||
/// This method should be called to aggregate a get similar request
|
|
||||||
fn get_similar(&self, aggregate: SimilarAggregator);
|
|
||||||
|
|
||||||
/// This method should be called to aggregate a post similar request
|
|
||||||
fn post_similar(&self, aggregate: SimilarAggregator);
|
|
||||||
|
|
||||||
/// This method should be called to aggregate a post array of searches
|
|
||||||
fn post_multi_search(&self, aggregate: MultiSearchAggregator);
|
|
||||||
|
|
||||||
/// This method should be called to aggregate post facet values searches
|
|
||||||
fn post_facet_search(&self, aggregate: FacetSearchAggregator);
|
|
||||||
|
|
||||||
// this method should be called to aggregate an add documents request
|
|
||||||
fn add_documents(
|
|
||||||
&self,
|
|
||||||
documents_query: &UpdateDocumentsQuery,
|
|
||||||
index_creation: bool,
|
|
||||||
request: &HttpRequest,
|
|
||||||
);
|
|
||||||
|
|
||||||
// this method should be called to aggregate a fetch documents request
|
|
||||||
fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
|
|
||||||
|
|
||||||
// this method should be called to aggregate a fetch documents request
|
|
||||||
fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
|
|
||||||
|
|
||||||
// this method should be called to aggregate a add documents request
|
|
||||||
fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest);
|
|
||||||
|
|
||||||
// this method should be called to batch an update documents request
|
|
||||||
fn update_documents(
|
|
||||||
&self,
|
|
||||||
documents_query: &UpdateDocumentsQuery,
|
|
||||||
index_creation: bool,
|
|
||||||
request: &HttpRequest,
|
|
||||||
);
|
|
||||||
|
|
||||||
// this method should be called to batch an update documents by function request
|
|
||||||
fn update_documents_by_function(
|
|
||||||
&self,
|
|
||||||
documents_query: &DocumentEditionByFunction,
|
|
||||||
index_creation: bool,
|
|
||||||
request: &HttpRequest,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -120,7 +120,7 @@ pub fn create_app(
|
|||||||
search_queue: Data<SearchQueue>,
|
search_queue: Data<SearchQueue>,
|
||||||
opt: Opt,
|
opt: Opt,
|
||||||
logs: (LogRouteHandle, LogStderrHandle),
|
logs: (LogRouteHandle, LogStderrHandle),
|
||||||
analytics: Arc<dyn Analytics>,
|
analytics: Data<Analytics>,
|
||||||
enable_dashboard: bool,
|
enable_dashboard: bool,
|
||||||
) -> actix_web::App<
|
) -> actix_web::App<
|
||||||
impl ServiceFactory<
|
impl ServiceFactory<
|
||||||
@ -473,14 +473,14 @@ pub fn configure_data(
|
|||||||
search_queue: Data<SearchQueue>,
|
search_queue: Data<SearchQueue>,
|
||||||
opt: &Opt,
|
opt: &Opt,
|
||||||
(logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
|
(logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
|
||||||
analytics: Arc<dyn Analytics>,
|
analytics: Data<Analytics>,
|
||||||
) {
|
) {
|
||||||
let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize;
|
let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize;
|
||||||
config
|
config
|
||||||
.app_data(index_scheduler)
|
.app_data(index_scheduler)
|
||||||
.app_data(auth)
|
.app_data(auth)
|
||||||
.app_data(search_queue)
|
.app_data(search_queue)
|
||||||
.app_data(web::Data::from(analytics))
|
.app_data(analytics)
|
||||||
.app_data(web::Data::new(logs_route))
|
.app_data(web::Data::new(logs_route))
|
||||||
.app_data(web::Data::new(logs_stderr))
|
.app_data(web::Data::new(logs_stderr))
|
||||||
.app_data(web::Data::new(opt.clone()))
|
.app_data(web::Data::new(opt.clone()))
|
||||||
|
@ -5,6 +5,7 @@ use std::path::PathBuf;
|
|||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread::available_parallelism;
|
use std::thread::available_parallelism;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use actix_web::http::KeepAlive;
|
use actix_web::http::KeepAlive;
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
@ -123,19 +124,12 @@ async fn try_main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?;
|
let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?;
|
||||||
|
|
||||||
#[cfg(all(not(debug_assertions), feature = "analytics"))]
|
let analytics =
|
||||||
let analytics = if !opt.no_analytics {
|
analytics::Analytics::new(&opt, index_scheduler.clone(), auth_controller.clone()).await;
|
||||||
analytics::SegmentAnalytics::new(&opt, index_scheduler.clone(), auth_controller.clone())
|
|
||||||
.await
|
|
||||||
} else {
|
|
||||||
analytics::MockAnalytics::new(&opt)
|
|
||||||
};
|
|
||||||
#[cfg(any(debug_assertions, not(feature = "analytics")))]
|
|
||||||
let analytics = analytics::MockAnalytics::new(&opt);
|
|
||||||
|
|
||||||
print_launch_resume(&opt, analytics.clone(), config_read_from);
|
print_launch_resume(&opt, analytics.clone(), config_read_from);
|
||||||
|
|
||||||
run_http(index_scheduler, auth_controller, opt, log_handle, analytics).await?;
|
run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -145,16 +139,23 @@ async fn run_http(
|
|||||||
auth_controller: Arc<AuthController>,
|
auth_controller: Arc<AuthController>,
|
||||||
opt: Opt,
|
opt: Opt,
|
||||||
logs: (LogRouteHandle, LogStderrHandle),
|
logs: (LogRouteHandle, LogStderrHandle),
|
||||||
analytics: Arc<dyn Analytics>,
|
analytics: Arc<Analytics>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let enable_dashboard = &opt.env == "development";
|
let enable_dashboard = &opt.env == "development";
|
||||||
let opt_clone = opt.clone();
|
let opt_clone = opt.clone();
|
||||||
let index_scheduler = Data::from(index_scheduler);
|
let index_scheduler = Data::from(index_scheduler);
|
||||||
let auth_controller = Data::from(auth_controller);
|
let auth_controller = Data::from(auth_controller);
|
||||||
|
let analytics = Data::from(analytics);
|
||||||
let search_queue = SearchQueue::new(
|
let search_queue = SearchQueue::new(
|
||||||
opt.experimental_search_queue_size,
|
opt.experimental_search_queue_size,
|
||||||
available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()),
|
available_parallelism()
|
||||||
);
|
.unwrap_or(NonZeroUsize::new(2).unwrap())
|
||||||
|
.checked_mul(opt.experimental_nb_searches_per_core)
|
||||||
|
.unwrap_or(NonZeroUsize::MAX),
|
||||||
|
)
|
||||||
|
.with_time_to_abort(Duration::from_secs(
|
||||||
|
usize::from(opt.experimental_drop_search_after) as u64
|
||||||
|
));
|
||||||
let search_queue = Data::new(search_queue);
|
let search_queue = Data::new(search_queue);
|
||||||
|
|
||||||
let http_server = HttpServer::new(move || {
|
let http_server = HttpServer::new(move || {
|
||||||
@ -180,11 +181,7 @@ async fn run_http(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn print_launch_resume(
|
pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Option<PathBuf>) {
|
||||||
opt: &Opt,
|
|
||||||
analytics: Arc<dyn Analytics>,
|
|
||||||
config_read_from: Option<PathBuf>,
|
|
||||||
) {
|
|
||||||
let build_info = build_info::BuildInfo::from_build();
|
let build_info = build_info::BuildInfo::from_build();
|
||||||
|
|
||||||
let protocol =
|
let protocol =
|
||||||
@ -226,7 +223,6 @@ pub fn print_launch_resume(
|
|||||||
eprintln!("Prototype:\t\t{:?}", prototype);
|
eprintln!("Prototype:\t\t{:?}", prototype);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(all(not(debug_assertions), feature = "analytics"))]
|
|
||||||
{
|
{
|
||||||
if !opt.no_analytics {
|
if !opt.no_analytics {
|
||||||
eprintln!(
|
eprintln!(
|
||||||
|
@ -2,7 +2,7 @@ use std::env::VarError;
|
|||||||
use std::ffi::OsStr;
|
use std::ffi::OsStr;
|
||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::io::{BufReader, Read};
|
use std::io::{BufReader, Read};
|
||||||
use std::num::ParseIntError;
|
use std::num::{NonZeroUsize, ParseIntError};
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
@ -29,7 +29,6 @@ const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
|
|||||||
const MEILI_ENV: &str = "MEILI_ENV";
|
const MEILI_ENV: &str = "MEILI_ENV";
|
||||||
const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
|
const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
|
||||||
const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
|
const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
|
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
|
||||||
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
|
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
|
||||||
const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH";
|
const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH";
|
||||||
@ -55,6 +54,8 @@ const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LO
|
|||||||
const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER";
|
const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER";
|
||||||
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
|
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
|
||||||
const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE";
|
const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE";
|
||||||
|
const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER";
|
||||||
|
const MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE: &str = "MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE";
|
||||||
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
|
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
|
||||||
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
|
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
|
||||||
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
|
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
|
||||||
@ -208,7 +209,6 @@ pub struct Opt {
|
|||||||
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
|
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
|
||||||
/// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
|
/// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
|
||||||
/// at any time.
|
/// at any time.
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
#[serde(default)] // we can't send true
|
#[serde(default)] // we can't send true
|
||||||
#[clap(long, env = MEILI_NO_ANALYTICS)]
|
#[clap(long, env = MEILI_NO_ANALYTICS)]
|
||||||
pub no_analytics: bool,
|
pub no_analytics: bool,
|
||||||
@ -357,10 +357,26 @@ pub struct Opt {
|
|||||||
/// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the
|
/// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the
|
||||||
/// queue is full it starts returning HTTP 503, Service Unavailable.
|
/// queue is full it starts returning HTTP 503, Service Unavailable.
|
||||||
/// The default value is 1000.
|
/// The default value is 1000.
|
||||||
#[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = 1000)]
|
#[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())]
|
||||||
#[serde(default)]
|
#[serde(default = "default_experimental_search_queue_size")]
|
||||||
pub experimental_search_queue_size: usize,
|
pub experimental_search_queue_size: usize,
|
||||||
|
|
||||||
|
/// Experimental drop search after. For more information, see: <https://github.com/orgs/meilisearch/discussions/783>
|
||||||
|
///
|
||||||
|
/// Lets you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it.
|
||||||
|
/// The default value is 60.
|
||||||
|
#[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())]
|
||||||
|
#[serde(default = "default_drop_search_after")]
|
||||||
|
pub experimental_drop_search_after: NonZeroUsize,
|
||||||
|
|
||||||
|
/// Experimental number of searches per core. For more information, see: <https://github.com/orgs/meilisearch/discussions/784>
|
||||||
|
///
|
||||||
|
/// Lets you customize how many search requests can run on each core concurrently.
|
||||||
|
/// The default value is 4.
|
||||||
|
#[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())]
|
||||||
|
#[serde(default = "default_nb_searches_per_core")]
|
||||||
|
pub experimental_nb_searches_per_core: NonZeroUsize,
|
||||||
|
|
||||||
/// Experimental logs mode feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/723>
|
/// Experimental logs mode feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/723>
|
||||||
///
|
///
|
||||||
/// Change the mode of the logs on the console.
|
/// Change the mode of the logs on the console.
|
||||||
@ -407,7 +423,6 @@ pub struct Opt {
|
|||||||
|
|
||||||
impl Opt {
|
impl Opt {
|
||||||
/// Whether analytics should be enabled or not.
|
/// Whether analytics should be enabled or not.
|
||||||
#[cfg(all(not(debug_assertions), feature = "analytics"))]
|
|
||||||
pub fn analytics(&self) -> bool {
|
pub fn analytics(&self) -> bool {
|
||||||
!self.no_analytics
|
!self.no_analytics
|
||||||
}
|
}
|
||||||
@ -487,11 +502,12 @@ impl Opt {
|
|||||||
ignore_missing_dump: _,
|
ignore_missing_dump: _,
|
||||||
ignore_dump_if_db_exists: _,
|
ignore_dump_if_db_exists: _,
|
||||||
config_file_path: _,
|
config_file_path: _,
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
no_analytics,
|
no_analytics,
|
||||||
experimental_contains_filter,
|
experimental_contains_filter,
|
||||||
experimental_enable_metrics,
|
experimental_enable_metrics,
|
||||||
experimental_search_queue_size,
|
experimental_search_queue_size,
|
||||||
|
experimental_drop_search_after,
|
||||||
|
experimental_nb_searches_per_core,
|
||||||
experimental_logs_mode,
|
experimental_logs_mode,
|
||||||
experimental_enable_logs_route,
|
experimental_enable_logs_route,
|
||||||
experimental_replication_parameters,
|
experimental_replication_parameters,
|
||||||
@ -513,10 +529,7 @@ impl Opt {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "analytics")]
|
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
|
||||||
{
|
|
||||||
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
|
|
||||||
}
|
|
||||||
export_to_env_if_not_present(
|
export_to_env_if_not_present(
|
||||||
MEILI_HTTP_PAYLOAD_SIZE_LIMIT,
|
MEILI_HTTP_PAYLOAD_SIZE_LIMIT,
|
||||||
http_payload_size_limit.to_string(),
|
http_payload_size_limit.to_string(),
|
||||||
@ -559,6 +572,14 @@ impl Opt {
|
|||||||
MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE,
|
MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE,
|
||||||
experimental_search_queue_size.to_string(),
|
experimental_search_queue_size.to_string(),
|
||||||
);
|
);
|
||||||
|
export_to_env_if_not_present(
|
||||||
|
MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER,
|
||||||
|
experimental_drop_search_after.to_string(),
|
||||||
|
);
|
||||||
|
export_to_env_if_not_present(
|
||||||
|
MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE,
|
||||||
|
experimental_nb_searches_per_core.to_string(),
|
||||||
|
);
|
||||||
export_to_env_if_not_present(
|
export_to_env_if_not_present(
|
||||||
MEILI_EXPERIMENTAL_LOGS_MODE,
|
MEILI_EXPERIMENTAL_LOGS_MODE,
|
||||||
experimental_logs_mode.to_string(),
|
experimental_logs_mode.to_string(),
|
||||||
@ -890,6 +911,18 @@ fn default_dump_dir() -> PathBuf {
|
|||||||
PathBuf::from(DEFAULT_DUMP_DIR)
|
PathBuf::from(DEFAULT_DUMP_DIR)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn default_experimental_search_queue_size() -> usize {
|
||||||
|
1000
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_drop_search_after() -> NonZeroUsize {
|
||||||
|
NonZeroUsize::new(60).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_nb_searches_per_core() -> NonZeroUsize {
|
||||||
|
NonZeroUsize::new(4).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
/// Indicates if a snapshot was scheduled, and if yes with which interval.
|
/// Indicates if a snapshot was scheduled, and if yes with which interval.
|
||||||
#[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
|
#[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
|
||||||
pub enum ScheduleSnapshot {
|
pub enum ScheduleSnapshot {
|
||||||
|
@ -4,7 +4,6 @@ use index_scheduler::IndexScheduler;
|
|||||||
use meilisearch_auth::AuthController;
|
use meilisearch_auth::AuthController;
|
||||||
use meilisearch_types::error::ResponseError;
|
use meilisearch_types::error::ResponseError;
|
||||||
use meilisearch_types::tasks::KindWithContent;
|
use meilisearch_types::tasks::KindWithContent;
|
||||||
use serde_json::json;
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::Analytics;
|
||||||
@ -18,14 +17,16 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
|||||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump))));
|
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
crate::empty_analytics!(DumpAnalytics, "Dump Created");
|
||||||
|
|
||||||
pub async fn create_dump(
|
pub async fn create_dump(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<IndexScheduler>>,
|
||||||
auth_controller: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<AuthController>>,
|
auth_controller: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<AuthController>>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
analytics.publish("Dump Created".to_string(), json!({}), Some(&req));
|
analytics.publish(DumpAnalytics::default(), &req);
|
||||||
|
|
||||||
let task = KindWithContent::DumpCreation {
|
let task = KindWithContent::DumpCreation {
|
||||||
keys: auth_controller.list_keys()?,
|
keys: auth_controller.list_keys()?,
|
||||||
|
@ -6,10 +6,10 @@ use index_scheduler::IndexScheduler;
|
|||||||
use meilisearch_types::deserr::DeserrJsonError;
|
use meilisearch_types::deserr::DeserrJsonError;
|
||||||
use meilisearch_types::error::ResponseError;
|
use meilisearch_types::error::ResponseError;
|
||||||
use meilisearch_types::keys::actions;
|
use meilisearch_types::keys::actions;
|
||||||
use serde_json::json;
|
use serde::Serialize;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::{Aggregate, Analytics};
|
||||||
use crate::extractors::authentication::policies::ActionPolicy;
|
use crate::extractors::authentication::policies::ActionPolicy;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
use crate::extractors::sequential_extractor::SeqHandler;
|
use crate::extractors::sequential_extractor::SeqHandler;
|
||||||
@ -17,7 +17,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
|
|||||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||||
cfg.service(
|
cfg.service(
|
||||||
web::resource("")
|
web::resource("")
|
||||||
.route(web::get().to(SeqHandler(get_features)))
|
.route(web::get().to(get_features))
|
||||||
.route(web::patch().to(SeqHandler(patch_features))),
|
.route(web::patch().to(SeqHandler(patch_features))),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -27,12 +27,9 @@ async fn get_features(
|
|||||||
ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>,
|
ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>,
|
||||||
Data<IndexScheduler>,
|
Data<IndexScheduler>,
|
||||||
>,
|
>,
|
||||||
req: HttpRequest,
|
|
||||||
analytics: Data<dyn Analytics>,
|
|
||||||
) -> HttpResponse {
|
) -> HttpResponse {
|
||||||
let features = index_scheduler.features();
|
let features = index_scheduler.features();
|
||||||
|
|
||||||
analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
|
|
||||||
let features = features.runtime_features();
|
let features = features.runtime_features();
|
||||||
debug!(returns = ?features, "Get features");
|
debug!(returns = ?features, "Get features");
|
||||||
HttpResponse::Ok().json(features)
|
HttpResponse::Ok().json(features)
|
||||||
@ -53,6 +50,35 @@ pub struct RuntimeTogglableFeatures {
|
|||||||
pub contains_filter: Option<bool>,
|
pub contains_filter: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct PatchExperimentalFeatureAnalytics {
|
||||||
|
vector_store: bool,
|
||||||
|
metrics: bool,
|
||||||
|
logs_route: bool,
|
||||||
|
edit_documents_by_function: bool,
|
||||||
|
contains_filter: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for PatchExperimentalFeatureAnalytics {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Experimental features Updated"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
vector_store: new.vector_store,
|
||||||
|
metrics: new.metrics,
|
||||||
|
logs_route: new.logs_route,
|
||||||
|
edit_documents_by_function: new.edit_documents_by_function,
|
||||||
|
contains_filter: new.contains_filter,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn patch_features(
|
async fn patch_features(
|
||||||
index_scheduler: GuardedData<
|
index_scheduler: GuardedData<
|
||||||
ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>,
|
ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>,
|
||||||
@ -60,7 +86,7 @@ async fn patch_features(
|
|||||||
>,
|
>,
|
||||||
new_features: AwebJson<RuntimeTogglableFeatures, DeserrJsonError>,
|
new_features: AwebJson<RuntimeTogglableFeatures, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: Data<dyn Analytics>,
|
analytics: Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let features = index_scheduler.features();
|
let features = index_scheduler.features();
|
||||||
debug!(parameters = ?new_features, "Patch features");
|
debug!(parameters = ?new_features, "Patch features");
|
||||||
@ -89,15 +115,14 @@ async fn patch_features(
|
|||||||
} = new_features;
|
} = new_features;
|
||||||
|
|
||||||
analytics.publish(
|
analytics.publish(
|
||||||
"Experimental features Updated".to_string(),
|
PatchExperimentalFeatureAnalytics {
|
||||||
json!({
|
vector_store,
|
||||||
"vector_store": vector_store,
|
metrics,
|
||||||
"metrics": metrics,
|
logs_route,
|
||||||
"logs_route": logs_route,
|
edit_documents_by_function,
|
||||||
"edit_documents_by_function": edit_documents_by_function,
|
contains_filter,
|
||||||
"contains_filter": contains_filter,
|
},
|
||||||
}),
|
&req,
|
||||||
Some(&req),
|
|
||||||
);
|
);
|
||||||
index_scheduler.put_runtime_features(new_features)?;
|
index_scheduler.put_runtime_features(new_features)?;
|
||||||
debug!(returns = ?new_features, "Patch features");
|
debug!(returns = ?new_features, "Patch features");
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
use std::io::ErrorKind;
|
use std::io::ErrorKind;
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
use actix_web::http::header::CONTENT_TYPE;
|
use actix_web::http::header::CONTENT_TYPE;
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
@ -23,14 +25,14 @@ use meilisearch_types::tasks::KindWithContent;
|
|||||||
use meilisearch_types::{milli, Document, Index};
|
use meilisearch_types::{milli, Document, Index};
|
||||||
use mime::Mime;
|
use mime::Mime;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use serde::Deserialize;
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use tempfile::tempfile;
|
use tempfile::tempfile;
|
||||||
use tokio::fs::File;
|
use tokio::fs::File;
|
||||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter};
|
use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter};
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind};
|
use crate::analytics::{Aggregate, AggregateMethod, Analytics};
|
||||||
use crate::error::MeilisearchHttpError;
|
use crate::error::MeilisearchHttpError;
|
||||||
use crate::error::PayloadError::ReceivePayload;
|
use crate::error::PayloadError::ReceivePayload;
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
@ -41,7 +43,7 @@ use crate::routes::{
|
|||||||
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
|
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
|
||||||
};
|
};
|
||||||
use crate::search::{parse_filter, RetrieveVectors};
|
use crate::search::{parse_filter, RetrieveVectors};
|
||||||
use crate::Opt;
|
use crate::{aggregate_methods, Opt};
|
||||||
|
|
||||||
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
|
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
|
||||||
vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
|
vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
|
||||||
@ -100,12 +102,84 @@ pub struct GetDocument {
|
|||||||
retrieve_vectors: Param<bool>,
|
retrieve_vectors: Param<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
aggregate_methods!(
|
||||||
|
DocumentsGET => "Documents Fetched GET",
|
||||||
|
DocumentsPOST => "Documents Fetched POST",
|
||||||
|
);
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct DocumentsFetchAggregator<Method: AggregateMethod> {
|
||||||
|
// a call on ../documents/:doc_id
|
||||||
|
per_document_id: bool,
|
||||||
|
// if a filter was used
|
||||||
|
per_filter: bool,
|
||||||
|
|
||||||
|
#[serde(rename = "vector.retrieve_vectors")]
|
||||||
|
retrieve_vectors: bool,
|
||||||
|
|
||||||
|
// pagination
|
||||||
|
#[serde(rename = "pagination.max_limit")]
|
||||||
|
max_limit: usize,
|
||||||
|
#[serde(rename = "pagination.max_offset")]
|
||||||
|
max_offset: usize,
|
||||||
|
|
||||||
|
marker: std::marker::PhantomData<Method>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||||
|
pub enum DocumentFetchKind {
|
||||||
|
PerDocumentId { retrieve_vectors: bool },
|
||||||
|
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> DocumentsFetchAggregator<Method> {
|
||||||
|
pub fn from_query(query: &DocumentFetchKind) -> Self {
|
||||||
|
let (limit, offset, retrieve_vectors) = match query {
|
||||||
|
DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
|
||||||
|
DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
|
||||||
|
(*limit, *offset, *retrieve_vectors)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
|
||||||
|
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
|
||||||
|
max_limit: limit,
|
||||||
|
max_offset: offset,
|
||||||
|
retrieve_vectors,
|
||||||
|
|
||||||
|
marker: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> Aggregate for DocumentsFetchAggregator<Method> {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
Method::event_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
per_document_id: self.per_document_id | new.per_document_id,
|
||||||
|
per_filter: self.per_filter | new.per_filter,
|
||||||
|
retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors,
|
||||||
|
max_limit: self.max_limit.max(new.max_limit),
|
||||||
|
max_offset: self.max_offset.max(new.max_offset),
|
||||||
|
marker: PhantomData,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn get_document(
|
pub async fn get_document(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
|
||||||
document_param: web::Path<DocumentParam>,
|
document_param: web::Path<DocumentParam>,
|
||||||
params: AwebQueryParameter<GetDocument, DeserrQueryParamError>,
|
params: AwebQueryParameter<GetDocument, DeserrQueryParamError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let DocumentParam { index_uid, document_id } = document_param.into_inner();
|
let DocumentParam { index_uid, document_id } = document_param.into_inner();
|
||||||
debug!(parameters = ?params, "Get document");
|
debug!(parameters = ?params, "Get document");
|
||||||
@ -117,8 +191,15 @@ pub async fn get_document(
|
|||||||
let features = index_scheduler.features();
|
let features = index_scheduler.features();
|
||||||
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
|
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
|
||||||
|
|
||||||
analytics.get_fetch_documents(
|
analytics.publish(
|
||||||
&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
|
DocumentsFetchAggregator::<DocumentsGET> {
|
||||||
|
retrieve_vectors: param_retrieve_vectors.0,
|
||||||
|
per_document_id: true,
|
||||||
|
per_filter: false,
|
||||||
|
max_limit: 0,
|
||||||
|
max_offset: 0,
|
||||||
|
marker: PhantomData,
|
||||||
|
},
|
||||||
&req,
|
&req,
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -129,17 +210,52 @@ pub async fn get_document(
|
|||||||
Ok(HttpResponse::Ok().json(document))
|
Ok(HttpResponse::Ok().json(document))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct DocumentsDeletionAggregator {
|
||||||
|
per_document_id: bool,
|
||||||
|
clear_all: bool,
|
||||||
|
per_batch: bool,
|
||||||
|
per_filter: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for DocumentsDeletionAggregator {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Documents Deleted"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
per_document_id: self.per_document_id | new.per_document_id,
|
||||||
|
clear_all: self.clear_all | new.clear_all,
|
||||||
|
per_batch: self.per_batch | new.per_batch,
|
||||||
|
per_filter: self.per_filter | new.per_filter,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn delete_document(
|
pub async fn delete_document(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
|
||||||
path: web::Path<DocumentParam>,
|
path: web::Path<DocumentParam>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let DocumentParam { index_uid, document_id } = path.into_inner();
|
let DocumentParam { index_uid, document_id } = path.into_inner();
|
||||||
let index_uid = IndexUid::try_from(index_uid)?;
|
let index_uid = IndexUid::try_from(index_uid)?;
|
||||||
|
|
||||||
analytics.delete_documents(DocumentDeletionKind::PerDocumentId, &req);
|
analytics.publish(
|
||||||
|
DocumentsDeletionAggregator {
|
||||||
|
per_document_id: true,
|
||||||
|
clear_all: false,
|
||||||
|
per_batch: false,
|
||||||
|
per_filter: false,
|
||||||
|
},
|
||||||
|
&req,
|
||||||
|
);
|
||||||
|
|
||||||
let task = KindWithContent::DocumentDeletion {
|
let task = KindWithContent::DocumentDeletion {
|
||||||
index_uid: index_uid.to_string(),
|
index_uid: index_uid.to_string(),
|
||||||
@ -190,17 +306,19 @@ pub async fn documents_by_query_post(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
body: AwebJson<BrowseQuery, DeserrJsonError>,
|
body: AwebJson<BrowseQuery, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let body = body.into_inner();
|
let body = body.into_inner();
|
||||||
debug!(parameters = ?body, "Get documents POST");
|
debug!(parameters = ?body, "Get documents POST");
|
||||||
|
|
||||||
analytics.post_fetch_documents(
|
analytics.publish(
|
||||||
&DocumentFetchKind::Normal {
|
DocumentsFetchAggregator::<DocumentsPOST> {
|
||||||
with_filter: body.filter.is_some(),
|
per_filter: body.filter.is_some(),
|
||||||
limit: body.limit,
|
|
||||||
offset: body.offset,
|
|
||||||
retrieve_vectors: body.retrieve_vectors,
|
retrieve_vectors: body.retrieve_vectors,
|
||||||
|
max_limit: body.limit,
|
||||||
|
max_offset: body.offset,
|
||||||
|
per_document_id: false,
|
||||||
|
marker: PhantomData,
|
||||||
},
|
},
|
||||||
&req,
|
&req,
|
||||||
);
|
);
|
||||||
@ -213,7 +331,7 @@ pub async fn get_documents(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>,
|
params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?params, "Get documents GET");
|
debug!(parameters = ?params, "Get documents GET");
|
||||||
|
|
||||||
@ -235,12 +353,14 @@ pub async fn get_documents(
|
|||||||
filter,
|
filter,
|
||||||
};
|
};
|
||||||
|
|
||||||
analytics.get_fetch_documents(
|
analytics.publish(
|
||||||
&DocumentFetchKind::Normal {
|
DocumentsFetchAggregator::<DocumentsGET> {
|
||||||
with_filter: query.filter.is_some(),
|
per_filter: query.filter.is_some(),
|
||||||
limit: query.limit,
|
|
||||||
offset: query.offset,
|
|
||||||
retrieve_vectors: query.retrieve_vectors,
|
retrieve_vectors: query.retrieve_vectors,
|
||||||
|
max_limit: query.limit,
|
||||||
|
max_offset: query.offset,
|
||||||
|
per_document_id: false,
|
||||||
|
marker: PhantomData,
|
||||||
},
|
},
|
||||||
&req,
|
&req,
|
||||||
);
|
);
|
||||||
@ -298,6 +418,39 @@ fn from_char_csv_delimiter(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
aggregate_methods!(
|
||||||
|
Replaced => "Documents Added",
|
||||||
|
Updated => "Documents Updated",
|
||||||
|
);
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct DocumentsAggregator<T: AggregateMethod> {
|
||||||
|
payload_types: HashSet<String>,
|
||||||
|
primary_key: HashSet<String>,
|
||||||
|
index_creation: bool,
|
||||||
|
#[serde(skip)]
|
||||||
|
method: PhantomData<T>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> Aggregate for DocumentsAggregator<Method> {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
Method::event_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
payload_types: self.payload_types.union(&new.payload_types).cloned().collect(),
|
||||||
|
primary_key: self.primary_key.union(&new.primary_key).cloned().collect(),
|
||||||
|
index_creation: self.index_creation | new.index_creation,
|
||||||
|
method: PhantomData,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn replace_documents(
|
pub async fn replace_documents(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>,
|
||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
@ -305,16 +458,32 @@ pub async fn replace_documents(
|
|||||||
body: Payload,
|
body: Payload,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
debug!(parameters = ?params, "Replace documents");
|
debug!(parameters = ?params, "Replace documents");
|
||||||
let params = params.into_inner();
|
let params = params.into_inner();
|
||||||
|
|
||||||
analytics.add_documents(
|
let mut content_types = HashSet::new();
|
||||||
&params,
|
let content_type = req
|
||||||
index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
.headers()
|
||||||
|
.get(CONTENT_TYPE)
|
||||||
|
.and_then(|s| s.to_str().ok())
|
||||||
|
.unwrap_or("unknown")
|
||||||
|
.to_string();
|
||||||
|
content_types.insert(content_type);
|
||||||
|
let mut primary_keys = HashSet::new();
|
||||||
|
if let Some(primary_key) = params.primary_key.clone() {
|
||||||
|
primary_keys.insert(primary_key);
|
||||||
|
}
|
||||||
|
analytics.publish(
|
||||||
|
DocumentsAggregator::<Replaced> {
|
||||||
|
payload_types: content_types,
|
||||||
|
primary_key: primary_keys,
|
||||||
|
index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
||||||
|
method: PhantomData,
|
||||||
|
},
|
||||||
&req,
|
&req,
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -346,16 +515,32 @@ pub async fn update_documents(
|
|||||||
body: Payload,
|
body: Payload,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
let params = params.into_inner();
|
let params = params.into_inner();
|
||||||
debug!(parameters = ?params, "Update documents");
|
debug!(parameters = ?params, "Update documents");
|
||||||
|
|
||||||
analytics.add_documents(
|
let mut content_types = HashSet::new();
|
||||||
&params,
|
let content_type = req
|
||||||
index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
.headers()
|
||||||
|
.get(CONTENT_TYPE)
|
||||||
|
.and_then(|s| s.to_str().ok())
|
||||||
|
.unwrap_or("unknown")
|
||||||
|
.to_string();
|
||||||
|
content_types.insert(content_type);
|
||||||
|
let mut primary_keys = HashSet::new();
|
||||||
|
if let Some(primary_key) = params.primary_key.clone() {
|
||||||
|
primary_keys.insert(primary_key);
|
||||||
|
}
|
||||||
|
analytics.publish(
|
||||||
|
DocumentsAggregator::<Updated> {
|
||||||
|
payload_types: content_types,
|
||||||
|
primary_key: primary_keys,
|
||||||
|
index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
||||||
|
method: PhantomData,
|
||||||
|
},
|
||||||
&req,
|
&req,
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -524,12 +709,20 @@ pub async fn delete_documents_batch(
|
|||||||
body: web::Json<Vec<Value>>,
|
body: web::Json<Vec<Value>>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?body, "Delete documents by batch");
|
debug!(parameters = ?body, "Delete documents by batch");
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
analytics.delete_documents(DocumentDeletionKind::PerBatch, &req);
|
analytics.publish(
|
||||||
|
DocumentsDeletionAggregator {
|
||||||
|
per_batch: true,
|
||||||
|
per_document_id: false,
|
||||||
|
clear_all: false,
|
||||||
|
per_filter: false,
|
||||||
|
},
|
||||||
|
&req,
|
||||||
|
);
|
||||||
|
|
||||||
let ids = body
|
let ids = body
|
||||||
.iter()
|
.iter()
|
||||||
@ -562,14 +755,22 @@ pub async fn delete_documents_by_filter(
|
|||||||
body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
|
body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?body, "Delete documents by filter");
|
debug!(parameters = ?body, "Delete documents by filter");
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
let index_uid = index_uid.into_inner();
|
let index_uid = index_uid.into_inner();
|
||||||
let filter = body.into_inner().filter;
|
let filter = body.into_inner().filter;
|
||||||
|
|
||||||
analytics.delete_documents(DocumentDeletionKind::PerFilter, &req);
|
analytics.publish(
|
||||||
|
DocumentsDeletionAggregator {
|
||||||
|
per_filter: true,
|
||||||
|
per_document_id: false,
|
||||||
|
clear_all: false,
|
||||||
|
per_batch: false,
|
||||||
|
},
|
||||||
|
&req,
|
||||||
|
);
|
||||||
|
|
||||||
// we ensure the filter is well formed before enqueuing it
|
// we ensure the filter is well formed before enqueuing it
|
||||||
crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())?
|
crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())?
|
||||||
@ -599,13 +800,41 @@ pub struct DocumentEditionByFunction {
|
|||||||
pub function: String,
|
pub function: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct EditDocumentsByFunctionAggregator {
|
||||||
|
// Set to true if at least one request was filtered
|
||||||
|
filtered: bool,
|
||||||
|
// Set to true if at least one request contained a context
|
||||||
|
with_context: bool,
|
||||||
|
|
||||||
|
index_creation: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for EditDocumentsByFunctionAggregator {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Documents Edited By Function"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
filtered: self.filtered | new.filtered,
|
||||||
|
with_context: self.with_context | new.with_context,
|
||||||
|
index_creation: self.index_creation | new.index_creation,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn edit_documents_by_function(
|
pub async fn edit_documents_by_function(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ALL }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ALL }>, Data<IndexScheduler>>,
|
||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebJson<DocumentEditionByFunction, DeserrJsonError>,
|
params: AwebJson<DocumentEditionByFunction, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?params, "Edit documents by function");
|
debug!(parameters = ?params, "Edit documents by function");
|
||||||
|
|
||||||
@ -617,9 +846,12 @@ pub async fn edit_documents_by_function(
|
|||||||
let index_uid = index_uid.into_inner();
|
let index_uid = index_uid.into_inner();
|
||||||
let params = params.into_inner();
|
let params = params.into_inner();
|
||||||
|
|
||||||
analytics.update_documents_by_function(
|
analytics.publish(
|
||||||
¶ms,
|
EditDocumentsByFunctionAggregator {
|
||||||
index_scheduler.index(&index_uid).is_err(),
|
filtered: params.filter.is_some(),
|
||||||
|
with_context: params.context.is_some(),
|
||||||
|
index_creation: index_scheduler.index(&index_uid).is_err(),
|
||||||
|
},
|
||||||
&req,
|
&req,
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -670,10 +902,18 @@ pub async fn clear_all_documents(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
analytics.delete_documents(DocumentDeletionKind::ClearAll, &req);
|
analytics.publish(
|
||||||
|
DocumentsDeletionAggregator {
|
||||||
|
clear_all: true,
|
||||||
|
per_document_id: false,
|
||||||
|
per_batch: false,
|
||||||
|
per_filter: false,
|
||||||
|
},
|
||||||
|
&req,
|
||||||
|
);
|
||||||
|
|
||||||
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
|
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
|
||||||
let uid = get_task_id(&req, &opt)?;
|
let uid = get_task_id(&req, &opt)?;
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
use std::collections::{BinaryHeap, HashSet};
|
||||||
|
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
use actix_web::{web, HttpRequest, HttpResponse};
|
use actix_web::{web, HttpRequest, HttpResponse};
|
||||||
use deserr::actix_web::AwebJson;
|
use deserr::actix_web::AwebJson;
|
||||||
@ -10,14 +12,15 @@ use meilisearch_types::locales::Locale;
|
|||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::{Analytics, FacetSearchAggregator};
|
use crate::analytics::{Aggregate, Analytics};
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
use crate::routes::indexes::search::search_kind;
|
use crate::routes::indexes::search::search_kind;
|
||||||
use crate::search::{
|
use crate::search::{
|
||||||
add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
|
add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy,
|
||||||
SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
|
RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
|
||||||
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
|
||||||
|
DEFAULT_SEARCH_OFFSET,
|
||||||
};
|
};
|
||||||
use crate::search_queue::SearchQueue;
|
use crate::search_queue::SearchQueue;
|
||||||
|
|
||||||
@ -53,20 +56,122 @@ pub struct FacetSearchQuery {
|
|||||||
pub locales: Option<Vec<Locale>>,
|
pub locales: Option<Vec<Locale>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Aggregated analytics for the facet-search route.
#[derive(Default)]
pub struct FacetSearchAggregator {
    // requests
    /// Number of facet-search requests received.
    total_received: usize,
    /// Number of facet-search requests that succeeded.
    total_succeeded: usize,
    /// Processing time, in milliseconds, of every successful request.
    time_spent: BinaryHeap<usize>,

    /// The set of all facet names that were used.
    facet_names: HashSet<String>,

    /// Has any parameter other than the facet name or the facet query been
    /// provided?
    additional_search_parameters_provided: bool,
}
|
||||||
|
|
||||||
|
impl FacetSearchAggregator {
|
||||||
|
#[allow(clippy::field_reassign_with_default)]
|
||||||
|
pub fn from_query(query: &FacetSearchQuery) -> Self {
|
||||||
|
let FacetSearchQuery {
|
||||||
|
facet_query: _,
|
||||||
|
facet_name,
|
||||||
|
vector,
|
||||||
|
q,
|
||||||
|
filter,
|
||||||
|
matching_strategy,
|
||||||
|
attributes_to_search_on,
|
||||||
|
hybrid,
|
||||||
|
ranking_score_threshold,
|
||||||
|
locales,
|
||||||
|
} = query;
|
||||||
|
|
||||||
|
Self {
|
||||||
|
total_received: 1,
|
||||||
|
facet_names: Some(facet_name.clone()).into_iter().collect(),
|
||||||
|
additional_search_parameters_provided: q.is_some()
|
||||||
|
|| vector.is_some()
|
||||||
|
|| filter.is_some()
|
||||||
|
|| *matching_strategy != MatchingStrategy::default()
|
||||||
|
|| attributes_to_search_on.is_some()
|
||||||
|
|| hybrid.is_some()
|
||||||
|
|| ranking_score_threshold.is_some()
|
||||||
|
|| locales.is_some(),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn succeed(&mut self, result: &FacetSearchResult) {
|
||||||
|
let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result;
|
||||||
|
self.total_succeeded = 1;
|
||||||
|
self.time_spent.push(*processing_time_ms as usize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for FacetSearchAggregator {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Facet Searched POST"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
for time in new.time_spent {
|
||||||
|
self.time_spent.push(time);
|
||||||
|
}
|
||||||
|
|
||||||
|
Box::new(Self {
|
||||||
|
total_received: self.total_received.saturating_add(new.total_received),
|
||||||
|
total_succeeded: self.total_succeeded.saturating_add(new.total_succeeded),
|
||||||
|
time_spent: self.time_spent,
|
||||||
|
facet_names: self.facet_names.union(&new.facet_names).cloned().collect(),
|
||||||
|
additional_search_parameters_provided: self.additional_search_parameters_provided
|
||||||
|
| new.additional_search_parameters_provided,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
let Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
time_spent,
|
||||||
|
facet_names,
|
||||||
|
additional_search_parameters_provided,
|
||||||
|
} = *self;
|
||||||
|
// the index of the 99th percentage of value
|
||||||
|
let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.;
|
||||||
|
// we get all the values in a sorted manner
|
||||||
|
let time_spent = time_spent.into_sorted_vec();
|
||||||
|
// We are only interested by the slowest value of the 99th fastest results
|
||||||
|
let time_spent = time_spent.get(percentile_99th as usize);
|
||||||
|
|
||||||
|
serde_json::json!({
|
||||||
|
"requests": {
|
||||||
|
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
|
||||||
|
"total_succeeded": total_succeeded,
|
||||||
|
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||||
|
"total_received": total_received,
|
||||||
|
},
|
||||||
|
"facets": {
|
||||||
|
"total_distinct_facet_count": facet_names.len(),
|
||||||
|
"additional_search_parameters_provided": additional_search_parameters_provided,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn search(
|
pub async fn search(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>,
|
||||||
search_queue: Data<SearchQueue>,
|
search_queue: Data<SearchQueue>,
|
||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebJson<FacetSearchQuery, DeserrJsonError>,
|
params: AwebJson<FacetSearchQuery, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
let query = params.into_inner();
|
let query = params.into_inner();
|
||||||
debug!(parameters = ?query, "Facet search");
|
debug!(parameters = ?query, "Facet search");
|
||||||
|
|
||||||
let mut aggregate = FacetSearchAggregator::from_query(&query, &req);
|
let mut aggregate = FacetSearchAggregator::from_query(&query);
|
||||||
|
|
||||||
let facet_query = query.facet_query.clone();
|
let facet_query = query.facet_query.clone();
|
||||||
let facet_name = query.facet_name.clone();
|
let facet_name = query.facet_name.clone();
|
||||||
@ -100,7 +205,7 @@ pub async fn search(
|
|||||||
if let Ok(ref search_result) = search_result {
|
if let Ok(ref search_result) = search_result {
|
||||||
aggregate.succeed(search_result);
|
aggregate.succeed(search_result);
|
||||||
}
|
}
|
||||||
analytics.post_facet_search(aggregate);
|
analytics.publish(aggregate, &req);
|
||||||
|
|
||||||
let search_result = search_result?;
|
let search_result = search_result?;
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
use std::collections::BTreeSet;
|
||||||
use std::convert::Infallible;
|
use std::convert::Infallible;
|
||||||
|
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
@ -13,12 +14,11 @@ use meilisearch_types::index_uid::IndexUid;
|
|||||||
use meilisearch_types::milli::{self, FieldDistribution, Index};
|
use meilisearch_types::milli::{self, FieldDistribution, Index};
|
||||||
use meilisearch_types::tasks::KindWithContent;
|
use meilisearch_types::tasks::KindWithContent;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_json::json;
|
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
|
use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::{Aggregate, Analytics};
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||||
use crate::extractors::sequential_extractor::SeqHandler;
|
use crate::extractors::sequential_extractor::SeqHandler;
|
||||||
@ -28,8 +28,11 @@ use crate::Opt;
|
|||||||
pub mod documents;
|
pub mod documents;
|
||||||
pub mod facet_search;
|
pub mod facet_search;
|
||||||
pub mod search;
|
pub mod search;
|
||||||
|
mod search_analytics;
|
||||||
pub mod settings;
|
pub mod settings;
|
||||||
|
mod settings_analytics;
|
||||||
pub mod similar;
|
pub mod similar;
|
||||||
|
mod similar_analytics;
|
||||||
|
|
||||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||||
cfg.service(
|
cfg.service(
|
||||||
@ -123,12 +126,31 @@ pub struct IndexCreateRequest {
|
|||||||
primary_key: Option<String>,
|
primary_key: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct IndexCreatedAggregate {
|
||||||
|
primary_key: BTreeSet<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for IndexCreatedAggregate {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Index Created"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn create_index(
|
pub async fn create_index(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>,
|
||||||
body: AwebJson<IndexCreateRequest, DeserrJsonError>,
|
body: AwebJson<IndexCreateRequest, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?body, "Create index");
|
debug!(parameters = ?body, "Create index");
|
||||||
let IndexCreateRequest { primary_key, uid } = body.into_inner();
|
let IndexCreateRequest { primary_key, uid } = body.into_inner();
|
||||||
@ -136,9 +158,8 @@ pub async fn create_index(
|
|||||||
let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid);
|
let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid);
|
||||||
if allow_index_creation {
|
if allow_index_creation {
|
||||||
analytics.publish(
|
analytics.publish(
|
||||||
"Index Created".to_string(),
|
IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() },
|
||||||
json!({ "primary_key": primary_key }),
|
&req,
|
||||||
Some(&req),
|
|
||||||
);
|
);
|
||||||
|
|
||||||
let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key };
|
let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key };
|
||||||
@ -194,21 +215,38 @@ pub async fn get_index(
|
|||||||
Ok(HttpResponse::Ok().json(index_view))
|
Ok(HttpResponse::Ok().json(index_view))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct IndexUpdatedAggregate {
|
||||||
|
primary_key: BTreeSet<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for IndexUpdatedAggregate {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Index Updated"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
pub async fn update_index(
|
pub async fn update_index(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_UPDATE }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_UPDATE }>, Data<IndexScheduler>>,
|
||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
body: AwebJson<UpdateIndexRequest, DeserrJsonError>,
|
body: AwebJson<UpdateIndexRequest, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?body, "Update index");
|
debug!(parameters = ?body, "Update index");
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
let body = body.into_inner();
|
let body = body.into_inner();
|
||||||
analytics.publish(
|
analytics.publish(
|
||||||
"Index Updated".to_string(),
|
IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() },
|
||||||
json!({ "primary_key": body.primary_key }),
|
&req,
|
||||||
Some(&req),
|
|
||||||
);
|
);
|
||||||
|
|
||||||
let task = KindWithContent::IndexUpdate {
|
let task = KindWithContent::IndexUpdate {
|
||||||
|
@ -13,12 +13,13 @@ use meilisearch_types::serde_cs::vec::CS;
|
|||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::{Analytics, SearchAggregator};
|
use crate::analytics::Analytics;
|
||||||
use crate::error::MeilisearchHttpError;
|
use crate::error::MeilisearchHttpError;
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
use crate::extractors::sequential_extractor::SeqHandler;
|
use crate::extractors::sequential_extractor::SeqHandler;
|
||||||
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
|
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
|
||||||
|
use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST};
|
||||||
use crate::search::{
|
use crate::search::{
|
||||||
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
|
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
|
||||||
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
|
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
|
||||||
@ -225,7 +226,7 @@ pub async fn search_with_url_query(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebQueryParameter<SearchQueryGet, DeserrQueryParamError>,
|
params: AwebQueryParameter<SearchQueryGet, DeserrQueryParamError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
debug!(parameters = ?params, "Search get");
|
debug!(parameters = ?params, "Search get");
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
@ -237,7 +238,7 @@ pub async fn search_with_url_query(
|
|||||||
add_search_rules(&mut query.filter, search_rules);
|
add_search_rules(&mut query.filter, search_rules);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut aggregate = SearchAggregator::from_query(&query, &req);
|
let mut aggregate = SearchAggregator::<SearchGET>::from_query(&query);
|
||||||
|
|
||||||
let index = index_scheduler.index(&index_uid)?;
|
let index = index_scheduler.index(&index_uid)?;
|
||||||
let features = index_scheduler.features();
|
let features = index_scheduler.features();
|
||||||
@ -254,7 +255,7 @@ pub async fn search_with_url_query(
|
|||||||
if let Ok(ref search_result) = search_result {
|
if let Ok(ref search_result) = search_result {
|
||||||
aggregate.succeed(search_result);
|
aggregate.succeed(search_result);
|
||||||
}
|
}
|
||||||
analytics.get_search(aggregate);
|
analytics.publish(aggregate, &req);
|
||||||
|
|
||||||
let search_result = search_result?;
|
let search_result = search_result?;
|
||||||
|
|
||||||
@ -268,7 +269,7 @@ pub async fn search_with_post(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebJson<SearchQuery, DeserrJsonError>,
|
params: AwebJson<SearchQuery, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
@ -280,7 +281,7 @@ pub async fn search_with_post(
|
|||||||
add_search_rules(&mut query.filter, search_rules);
|
add_search_rules(&mut query.filter, search_rules);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut aggregate = SearchAggregator::from_query(&query, &req);
|
let mut aggregate = SearchAggregator::<SearchPOST>::from_query(&query);
|
||||||
|
|
||||||
let index = index_scheduler.index(&index_uid)?;
|
let index = index_scheduler.index(&index_uid)?;
|
||||||
|
|
||||||
@ -302,7 +303,7 @@ pub async fn search_with_post(
|
|||||||
MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
|
MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
analytics.post_search(aggregate);
|
analytics.publish(aggregate, &req);
|
||||||
|
|
||||||
let search_result = search_result?;
|
let search_result = search_result?;
|
||||||
|
|
||||||
|
483
crates/meilisearch/src/routes/indexes/search_analytics.rs
Normal file
483
crates/meilisearch/src/routes/indexes/search_analytics.rs
Normal file
@ -0,0 +1,483 @@
|
|||||||
|
use std::collections::{BTreeSet, BinaryHeap, HashMap};
|
||||||
|
|
||||||
|
use meilisearch_types::locales::Locale;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::Regex;
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
|
use crate::aggregate_methods;
|
||||||
|
use crate::analytics::{Aggregate, AggregateMethod};
|
||||||
|
use crate::search::{
|
||||||
|
SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
|
||||||
|
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
|
||||||
|
DEFAULT_SEMANTIC_RATIO,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Pairs the `SearchGET` and `SearchPOST` names with a string each. Elsewhere in
// this change they are used as the `Method` type parameter of
// `SearchAggregator` (e.g. `SearchAggregator::<SearchGET>`), so the macro
// presumably declares them as `AggregateMethod` marker types whose string is
// the reported event name. NOTE(review): confirm against the
// `aggregate_methods!` definition.
aggregate_methods!(
|
||||||
|
SearchGET => "Documents Searched GET",
|
||||||
|
SearchPOST => "Documents Searched POST",
|
||||||
|
);
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
pub struct SearchAggregator<Method: AggregateMethod> {
|
||||||
|
// requests
|
||||||
|
total_received: usize,
|
||||||
|
total_succeeded: usize,
|
||||||
|
total_degraded: usize,
|
||||||
|
total_used_negative_operator: usize,
|
||||||
|
time_spent: BinaryHeap<usize>,
|
||||||
|
|
||||||
|
// sort
|
||||||
|
sort_with_geo_point: bool,
|
||||||
|
// every time a request has a filter, this field must be incremented by the number of terms it contains
|
||||||
|
sort_sum_of_criteria_terms: usize,
|
||||||
|
// every time a request has a filter, this field must be incremented by one
|
||||||
|
sort_total_number_of_criteria: usize,
|
||||||
|
|
||||||
|
// distinct
|
||||||
|
distinct: bool,
|
||||||
|
|
||||||
|
// filter
|
||||||
|
filter_with_geo_radius: bool,
|
||||||
|
filter_with_geo_bounding_box: bool,
|
||||||
|
// every time a request has a filter, this field must be incremented by the number of terms it contains
|
||||||
|
filter_sum_of_criteria_terms: usize,
|
||||||
|
// every time a request has a filter, this field must be incremented by one
|
||||||
|
filter_total_number_of_criteria: usize,
|
||||||
|
used_syntax: HashMap<String, usize>,
|
||||||
|
|
||||||
|
// attributes_to_search_on
|
||||||
|
// every time a search is done using attributes_to_search_on
|
||||||
|
attributes_to_search_on_total_number_of_uses: usize,
|
||||||
|
|
||||||
|
// q
|
||||||
|
// The maximum number of terms in a q request
|
||||||
|
max_terms_number: usize,
|
||||||
|
|
||||||
|
// vector
|
||||||
|
// The maximum number of floats in a vector request
|
||||||
|
max_vector_size: usize,
|
||||||
|
// Whether the semantic ratio passed to a hybrid search equals the default ratio.
|
||||||
|
semantic_ratio: bool,
|
||||||
|
hybrid: bool,
|
||||||
|
retrieve_vectors: bool,
|
||||||
|
|
||||||
|
// every time a search is done, we increment the counter linked to the used settings
|
||||||
|
matching_strategy: HashMap<String, usize>,
|
||||||
|
|
||||||
|
// List of the unique Locales passed as parameter
|
||||||
|
locales: BTreeSet<Locale>,
|
||||||
|
|
||||||
|
// pagination
|
||||||
|
max_limit: usize,
|
||||||
|
max_offset: usize,
|
||||||
|
finite_pagination: usize,
|
||||||
|
|
||||||
|
// formatting
|
||||||
|
max_attributes_to_retrieve: usize,
|
||||||
|
max_attributes_to_highlight: usize,
|
||||||
|
highlight_pre_tag: bool,
|
||||||
|
highlight_post_tag: bool,
|
||||||
|
max_attributes_to_crop: usize,
|
||||||
|
crop_marker: bool,
|
||||||
|
show_matches_position: bool,
|
||||||
|
crop_length: bool,
|
||||||
|
|
||||||
|
// facets
|
||||||
|
facets_sum_of_terms: usize,
|
||||||
|
facets_total_number_of_facets: usize,
|
||||||
|
|
||||||
|
// scoring
|
||||||
|
show_ranking_score: bool,
|
||||||
|
show_ranking_score_details: bool,
|
||||||
|
ranking_score_threshold: bool,
|
||||||
|
|
||||||
|
marker: std::marker::PhantomData<Method>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> SearchAggregator<Method> {
|
||||||
|
#[allow(clippy::field_reassign_with_default)]
|
||||||
|
pub fn from_query(query: &SearchQuery) -> Self {
|
||||||
|
let SearchQuery {
|
||||||
|
q,
|
||||||
|
vector,
|
||||||
|
offset,
|
||||||
|
limit,
|
||||||
|
page,
|
||||||
|
hits_per_page,
|
||||||
|
attributes_to_retrieve: _,
|
||||||
|
retrieve_vectors,
|
||||||
|
attributes_to_crop: _,
|
||||||
|
crop_length,
|
||||||
|
attributes_to_highlight: _,
|
||||||
|
show_matches_position,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
filter,
|
||||||
|
sort,
|
||||||
|
distinct,
|
||||||
|
facets: _,
|
||||||
|
highlight_pre_tag,
|
||||||
|
highlight_post_tag,
|
||||||
|
crop_marker,
|
||||||
|
matching_strategy,
|
||||||
|
attributes_to_search_on,
|
||||||
|
hybrid,
|
||||||
|
ranking_score_threshold,
|
||||||
|
locales,
|
||||||
|
} = query;
|
||||||
|
|
||||||
|
let mut ret = Self::default();
|
||||||
|
|
||||||
|
ret.total_received = 1;
|
||||||
|
|
||||||
|
if let Some(ref sort) = sort {
|
||||||
|
ret.sort_total_number_of_criteria = 1;
|
||||||
|
ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint("));
|
||||||
|
ret.sort_sum_of_criteria_terms = sort.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.distinct = distinct.is_some();
|
||||||
|
|
||||||
|
if let Some(ref filter) = filter {
|
||||||
|
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
|
||||||
|
ret.filter_total_number_of_criteria = 1;
|
||||||
|
|
||||||
|
let syntax = match filter {
|
||||||
|
Value::String(_) => "string".to_string(),
|
||||||
|
Value::Array(values) => {
|
||||||
|
if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
|
||||||
|
"mixed".to_string()
|
||||||
|
} else {
|
||||||
|
"array".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => "none".to_string(),
|
||||||
|
};
|
||||||
|
// convert the string to a HashMap
|
||||||
|
ret.used_syntax.insert(syntax, 1);
|
||||||
|
|
||||||
|
let stringified_filters = filter.to_string();
|
||||||
|
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
|
||||||
|
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
|
||||||
|
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
// attributes_to_search_on
|
||||||
|
if attributes_to_search_on.is_some() {
|
||||||
|
ret.attributes_to_search_on_total_number_of_uses = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(ref q) = q {
|
||||||
|
ret.max_terms_number = q.split_whitespace().count();
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(ref vector) = vector {
|
||||||
|
ret.max_vector_size = vector.len();
|
||||||
|
}
|
||||||
|
ret.retrieve_vectors |= retrieve_vectors;
|
||||||
|
|
||||||
|
if query.is_finite_pagination() {
|
||||||
|
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
|
||||||
|
ret.max_limit = limit;
|
||||||
|
ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit;
|
||||||
|
ret.finite_pagination = 1;
|
||||||
|
} else {
|
||||||
|
ret.max_limit = *limit;
|
||||||
|
ret.max_offset = *offset;
|
||||||
|
ret.finite_pagination = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1);
|
||||||
|
|
||||||
|
if let Some(locales) = locales {
|
||||||
|
ret.locales = locales.iter().copied().collect();
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG();
|
||||||
|
ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG();
|
||||||
|
ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER();
|
||||||
|
ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH();
|
||||||
|
ret.show_matches_position = *show_matches_position;
|
||||||
|
|
||||||
|
ret.show_ranking_score = *show_ranking_score;
|
||||||
|
ret.show_ranking_score_details = *show_ranking_score_details;
|
||||||
|
ret.ranking_score_threshold = ranking_score_threshold.is_some();
|
||||||
|
|
||||||
|
if let Some(hybrid) = hybrid {
|
||||||
|
ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO();
|
||||||
|
ret.hybrid = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn succeed(&mut self, result: &SearchResult) {
|
||||||
|
let SearchResult {
|
||||||
|
hits: _,
|
||||||
|
query: _,
|
||||||
|
processing_time_ms,
|
||||||
|
hits_info: _,
|
||||||
|
semantic_hit_count: _,
|
||||||
|
facet_distribution: _,
|
||||||
|
facet_stats: _,
|
||||||
|
degraded,
|
||||||
|
used_negative_operator,
|
||||||
|
} = result;
|
||||||
|
|
||||||
|
self.total_succeeded = self.total_succeeded.saturating_add(1);
|
||||||
|
if *degraded {
|
||||||
|
self.total_degraded = self.total_degraded.saturating_add(1);
|
||||||
|
}
|
||||||
|
if *used_negative_operator {
|
||||||
|
self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1);
|
||||||
|
}
|
||||||
|
self.time_spent.push(*processing_time_ms as usize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
Method::event_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
let Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
mut time_spent,
|
||||||
|
sort_with_geo_point,
|
||||||
|
sort_sum_of_criteria_terms,
|
||||||
|
sort_total_number_of_criteria,
|
||||||
|
distinct,
|
||||||
|
filter_with_geo_radius,
|
||||||
|
filter_with_geo_bounding_box,
|
||||||
|
filter_sum_of_criteria_terms,
|
||||||
|
filter_total_number_of_criteria,
|
||||||
|
used_syntax,
|
||||||
|
attributes_to_search_on_total_number_of_uses,
|
||||||
|
max_terms_number,
|
||||||
|
max_vector_size,
|
||||||
|
retrieve_vectors,
|
||||||
|
matching_strategy,
|
||||||
|
max_limit,
|
||||||
|
max_offset,
|
||||||
|
finite_pagination,
|
||||||
|
max_attributes_to_retrieve,
|
||||||
|
max_attributes_to_highlight,
|
||||||
|
highlight_pre_tag,
|
||||||
|
highlight_post_tag,
|
||||||
|
max_attributes_to_crop,
|
||||||
|
crop_marker,
|
||||||
|
show_matches_position,
|
||||||
|
crop_length,
|
||||||
|
facets_sum_of_terms,
|
||||||
|
facets_total_number_of_facets,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
semantic_ratio,
|
||||||
|
hybrid,
|
||||||
|
total_degraded,
|
||||||
|
total_used_negative_operator,
|
||||||
|
ranking_score_threshold,
|
||||||
|
mut locales,
|
||||||
|
marker: _,
|
||||||
|
} = *new;
|
||||||
|
|
||||||
|
// request
|
||||||
|
self.total_received = self.total_received.saturating_add(total_received);
|
||||||
|
self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
|
||||||
|
self.total_degraded = self.total_degraded.saturating_add(total_degraded);
|
||||||
|
self.total_used_negative_operator =
|
||||||
|
self.total_used_negative_operator.saturating_add(total_used_negative_operator);
|
||||||
|
self.time_spent.append(&mut time_spent);
|
||||||
|
|
||||||
|
// sort
|
||||||
|
self.sort_with_geo_point |= sort_with_geo_point;
|
||||||
|
self.sort_sum_of_criteria_terms =
|
||||||
|
self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms);
|
||||||
|
self.sort_total_number_of_criteria =
|
||||||
|
self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria);
|
||||||
|
|
||||||
|
// distinct
|
||||||
|
self.distinct |= distinct;
|
||||||
|
|
||||||
|
// filter
|
||||||
|
self.filter_with_geo_radius |= filter_with_geo_radius;
|
||||||
|
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
|
||||||
|
self.filter_sum_of_criteria_terms =
|
||||||
|
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
|
||||||
|
self.filter_total_number_of_criteria =
|
||||||
|
self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria);
|
||||||
|
for (key, value) in used_syntax.into_iter() {
|
||||||
|
let used_syntax = self.used_syntax.entry(key).or_insert(0);
|
||||||
|
*used_syntax = used_syntax.saturating_add(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
// attributes_to_search_on
|
||||||
|
self.attributes_to_search_on_total_number_of_uses = self
|
||||||
|
.attributes_to_search_on_total_number_of_uses
|
||||||
|
.saturating_add(attributes_to_search_on_total_number_of_uses);
|
||||||
|
|
||||||
|
// q
|
||||||
|
self.max_terms_number = self.max_terms_number.max(max_terms_number);
|
||||||
|
|
||||||
|
// vector
|
||||||
|
self.max_vector_size = self.max_vector_size.max(max_vector_size);
|
||||||
|
self.retrieve_vectors |= retrieve_vectors;
|
||||||
|
self.semantic_ratio |= semantic_ratio;
|
||||||
|
self.hybrid |= hybrid;
|
||||||
|
|
||||||
|
// pagination
|
||||||
|
self.max_limit = self.max_limit.max(max_limit);
|
||||||
|
self.max_offset = self.max_offset.max(max_offset);
|
||||||
|
self.finite_pagination += finite_pagination;
|
||||||
|
|
||||||
|
// formatting
|
||||||
|
self.max_attributes_to_retrieve =
|
||||||
|
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve);
|
||||||
|
self.max_attributes_to_highlight =
|
||||||
|
self.max_attributes_to_highlight.max(max_attributes_to_highlight);
|
||||||
|
self.highlight_pre_tag |= highlight_pre_tag;
|
||||||
|
self.highlight_post_tag |= highlight_post_tag;
|
||||||
|
self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop);
|
||||||
|
self.crop_marker |= crop_marker;
|
||||||
|
self.show_matches_position |= show_matches_position;
|
||||||
|
self.crop_length |= crop_length;
|
||||||
|
|
||||||
|
// facets
|
||||||
|
self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms);
|
||||||
|
self.facets_total_number_of_facets =
|
||||||
|
self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets);
|
||||||
|
|
||||||
|
// matching strategy
|
||||||
|
for (key, value) in matching_strategy.into_iter() {
|
||||||
|
let matching_strategy = self.matching_strategy.entry(key).or_insert(0);
|
||||||
|
*matching_strategy = matching_strategy.saturating_add(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
// scoring
|
||||||
|
self.show_ranking_score |= show_ranking_score;
|
||||||
|
self.show_ranking_score_details |= show_ranking_score_details;
|
||||||
|
self.ranking_score_threshold |= ranking_score_threshold;
|
||||||
|
|
||||||
|
// locales
|
||||||
|
self.locales.append(&mut locales);
|
||||||
|
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
let Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
time_spent,
|
||||||
|
sort_with_geo_point,
|
||||||
|
sort_sum_of_criteria_terms,
|
||||||
|
sort_total_number_of_criteria,
|
||||||
|
distinct,
|
||||||
|
filter_with_geo_radius,
|
||||||
|
filter_with_geo_bounding_box,
|
||||||
|
filter_sum_of_criteria_terms,
|
||||||
|
filter_total_number_of_criteria,
|
||||||
|
used_syntax,
|
||||||
|
attributes_to_search_on_total_number_of_uses,
|
||||||
|
max_terms_number,
|
||||||
|
max_vector_size,
|
||||||
|
retrieve_vectors,
|
||||||
|
matching_strategy,
|
||||||
|
max_limit,
|
||||||
|
max_offset,
|
||||||
|
finite_pagination,
|
||||||
|
max_attributes_to_retrieve,
|
||||||
|
max_attributes_to_highlight,
|
||||||
|
highlight_pre_tag,
|
||||||
|
highlight_post_tag,
|
||||||
|
max_attributes_to_crop,
|
||||||
|
crop_marker,
|
||||||
|
show_matches_position,
|
||||||
|
crop_length,
|
||||||
|
facets_sum_of_terms,
|
||||||
|
facets_total_number_of_facets,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
semantic_ratio,
|
||||||
|
hybrid,
|
||||||
|
total_degraded,
|
||||||
|
total_used_negative_operator,
|
||||||
|
ranking_score_threshold,
|
||||||
|
locales,
|
||||||
|
marker: _,
|
||||||
|
} = *self;
|
||||||
|
|
||||||
|
// we get all the values in a sorted manner
|
||||||
|
let time_spent = time_spent.into_sorted_vec();
|
||||||
|
// the index of the 99th percentage of value
|
||||||
|
let percentile_99th = time_spent.len() * 99 / 100;
|
||||||
|
// We are only interested by the slowest value of the 99th fastest results
|
||||||
|
let time_spent = time_spent.get(percentile_99th);
|
||||||
|
|
||||||
|
json!({
|
||||||
|
"requests": {
|
||||||
|
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
|
||||||
|
"total_succeeded": total_succeeded,
|
||||||
|
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||||
|
"total_received": total_received,
|
||||||
|
"total_degraded": total_degraded,
|
||||||
|
"total_used_negative_operator": total_used_negative_operator,
|
||||||
|
},
|
||||||
|
"sort": {
|
||||||
|
"with_geoPoint": sort_with_geo_point,
|
||||||
|
"avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64),
|
||||||
|
},
|
||||||
|
"distinct": distinct,
|
||||||
|
"filter": {
|
||||||
|
"with_geoRadius": filter_with_geo_radius,
|
||||||
|
"with_geoBoundingBox": filter_with_geo_bounding_box,
|
||||||
|
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
|
||||||
|
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||||
|
},
|
||||||
|
"attributes_to_search_on": {
|
||||||
|
"total_number_of_uses": attributes_to_search_on_total_number_of_uses,
|
||||||
|
},
|
||||||
|
"q": {
|
||||||
|
"max_terms_number": max_terms_number,
|
||||||
|
},
|
||||||
|
"vector": {
|
||||||
|
"max_vector_size": max_vector_size,
|
||||||
|
"retrieve_vectors": retrieve_vectors,
|
||||||
|
},
|
||||||
|
"hybrid": {
|
||||||
|
"enabled": hybrid,
|
||||||
|
"semantic_ratio": semantic_ratio,
|
||||||
|
},
|
||||||
|
"pagination": {
|
||||||
|
"max_limit": max_limit,
|
||||||
|
"max_offset": max_offset,
|
||||||
|
"most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" },
|
||||||
|
},
|
||||||
|
"formatting": {
|
||||||
|
"max_attributes_to_retrieve": max_attributes_to_retrieve,
|
||||||
|
"max_attributes_to_highlight": max_attributes_to_highlight,
|
||||||
|
"highlight_pre_tag": highlight_pre_tag,
|
||||||
|
"highlight_post_tag": highlight_post_tag,
|
||||||
|
"max_attributes_to_crop": max_attributes_to_crop,
|
||||||
|
"crop_marker": crop_marker,
|
||||||
|
"show_matches_position": show_matches_position,
|
||||||
|
"crop_length": crop_length,
|
||||||
|
},
|
||||||
|
"facets": {
|
||||||
|
"avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64),
|
||||||
|
},
|
||||||
|
"matching_strategy": {
|
||||||
|
"most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||||
|
},
|
||||||
|
"locales": locales,
|
||||||
|
"scoring": {
|
||||||
|
"show_ranking_score": show_ranking_score,
|
||||||
|
"show_ranking_score_details": show_ranking_score_details,
|
||||||
|
"ranking_score_threshold": ranking_score_threshold,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
@ -4,14 +4,13 @@ use deserr::actix_web::AwebJson;
|
|||||||
use index_scheduler::IndexScheduler;
|
use index_scheduler::IndexScheduler;
|
||||||
use meilisearch_types::deserr::DeserrJsonError;
|
use meilisearch_types::deserr::DeserrJsonError;
|
||||||
use meilisearch_types::error::ResponseError;
|
use meilisearch_types::error::ResponseError;
|
||||||
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
|
||||||
use meilisearch_types::index_uid::IndexUid;
|
use meilisearch_types::index_uid::IndexUid;
|
||||||
use meilisearch_types::milli::update::Setting;
|
use meilisearch_types::milli::update::Setting;
|
||||||
use meilisearch_types::settings::{settings, RankingRuleView, SecretPolicy, Settings, Unchecked};
|
use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked};
|
||||||
use meilisearch_types::tasks::KindWithContent;
|
use meilisearch_types::tasks::KindWithContent;
|
||||||
use serde_json::json;
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::settings_analytics::*;
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::Analytics;
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
@ -20,7 +19,7 @@ use crate::Opt;
|
|||||||
|
|
||||||
#[macro_export]
|
#[macro_export]
|
||||||
macro_rules! make_setting_route {
|
macro_rules! make_setting_route {
|
||||||
($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics_var:ident, $analytics:expr) => {
|
($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => {
|
||||||
pub mod $attr {
|
pub mod $attr {
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
use actix_web::{web, HttpRequest, HttpResponse, Resource};
|
use actix_web::{web, HttpRequest, HttpResponse, Resource};
|
||||||
@ -80,7 +79,7 @@ macro_rules! make_setting_route {
|
|||||||
body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>,
|
body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
$analytics_var: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> std::result::Result<HttpResponse, ResponseError> {
|
) -> std::result::Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
@ -88,7 +87,10 @@ macro_rules! make_setting_route {
|
|||||||
debug!(parameters = ?body, "Update settings");
|
debug!(parameters = ?body, "Update settings");
|
||||||
|
|
||||||
#[allow(clippy::redundant_closure_call)]
|
#[allow(clippy::redundant_closure_call)]
|
||||||
$analytics(&body, &req);
|
analytics.publish(
|
||||||
|
$crate::routes::indexes::settings_analytics::$analytics::new(body.as_ref()).into_settings(),
|
||||||
|
&req,
|
||||||
|
);
|
||||||
|
|
||||||
let new_settings = Settings {
|
let new_settings = Settings {
|
||||||
$attr: match body {
|
$attr: match body {
|
||||||
@ -160,21 +162,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
filterable_attributes,
|
filterable_attributes,
|
||||||
"filterableAttributes",
|
"filterableAttributes",
|
||||||
analytics,
|
FilterableAttributesAnalytics
|
||||||
|setting: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"FilterableAttributes Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"filterable_attributes": {
|
|
||||||
"total": setting.as_ref().map(|filter| filter.len()).unwrap_or(0),
|
|
||||||
"has_geo": setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false),
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -186,21 +174,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
sortable_attributes,
|
sortable_attributes,
|
||||||
"sortableAttributes",
|
"sortableAttributes",
|
||||||
analytics,
|
SortableAttributesAnalytics
|
||||||
|setting: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"SortableAttributes Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"sortable_attributes": {
|
|
||||||
"total": setting.as_ref().map(|sort| sort.len()),
|
|
||||||
"has_geo": setting.as_ref().map(|sort| sort.contains("_geo")),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -212,21 +186,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
displayed_attributes,
|
displayed_attributes,
|
||||||
"displayedAttributes",
|
"displayedAttributes",
|
||||||
analytics,
|
DisplayedAttributesAnalytics
|
||||||
|displayed: &Option<Vec<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"DisplayedAttributes Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"displayed_attributes": {
|
|
||||||
"total": displayed.as_ref().map(|displayed| displayed.len()),
|
|
||||||
"with_wildcard": displayed.as_ref().map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -238,40 +198,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
typo_tolerance,
|
typo_tolerance,
|
||||||
"typoTolerance",
|
"typoTolerance",
|
||||||
analytics,
|
TypoToleranceAnalytics
|
||||||
|setting: &Option<meilisearch_types::settings::TypoSettings>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"TypoTolerance Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"typo_tolerance": {
|
|
||||||
"enabled": setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))),
|
|
||||||
"disable_on_attributes": setting
|
|
||||||
.as_ref()
|
|
||||||
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
|
|
||||||
"disable_on_words": setting
|
|
||||||
.as_ref()
|
|
||||||
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
|
|
||||||
"min_word_size_for_one_typo": setting
|
|
||||||
.as_ref()
|
|
||||||
.and_then(|s| s.min_word_size_for_typos
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.map(|s| s.one_typo.set()))
|
|
||||||
.flatten(),
|
|
||||||
"min_word_size_for_two_typos": setting
|
|
||||||
.as_ref()
|
|
||||||
.and_then(|s| s.min_word_size_for_typos
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.map(|s| s.two_typos.set()))
|
|
||||||
.flatten(),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -283,21 +210,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
searchable_attributes,
|
searchable_attributes,
|
||||||
"searchableAttributes",
|
"searchableAttributes",
|
||||||
analytics,
|
SearchableAttributesAnalytics
|
||||||
|setting: &Option<Vec<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"SearchableAttributes Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"searchable_attributes": {
|
|
||||||
"total": setting.as_ref().map(|searchable| searchable.len()),
|
|
||||||
"with_wildcard": setting.as_ref().map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -309,20 +222,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
stop_words,
|
stop_words,
|
||||||
"stopWords",
|
"stopWords",
|
||||||
analytics,
|
StopWordsAnalytics
|
||||||
|stop_words: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"StopWords Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"stop_words": {
|
|
||||||
"total": stop_words.as_ref().map(|stop_words| stop_words.len()),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -334,20 +234,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
non_separator_tokens,
|
non_separator_tokens,
|
||||||
"nonSeparatorTokens",
|
"nonSeparatorTokens",
|
||||||
analytics,
|
NonSeparatorTokensAnalytics
|
||||||
|non_separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"nonSeparatorTokens Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"non_separator_tokens": {
|
|
||||||
"total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -359,20 +246,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
separator_tokens,
|
separator_tokens,
|
||||||
"separatorTokens",
|
"separatorTokens",
|
||||||
analytics,
|
SeparatorTokensAnalytics
|
||||||
|separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"separatorTokens Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"separator_tokens": {
|
|
||||||
"total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -384,20 +258,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
dictionary,
|
dictionary,
|
||||||
"dictionary",
|
"dictionary",
|
||||||
analytics,
|
DictionaryAnalytics
|
||||||
|dictionary: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"dictionary Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"dictionary": {
|
|
||||||
"total": dictionary.as_ref().map(|dictionary| dictionary.len()),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -409,20 +270,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
synonyms,
|
synonyms,
|
||||||
"synonyms",
|
"synonyms",
|
||||||
analytics,
|
SynonymsAnalytics
|
||||||
|synonyms: &Option<std::collections::BTreeMap<String, Vec<String>>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"Synonyms Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"synonyms": {
|
|
||||||
"total": synonyms.as_ref().map(|synonyms| synonyms.len()),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -434,19 +282,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
distinct_attribute,
|
distinct_attribute,
|
||||||
"distinctAttribute",
|
"distinctAttribute",
|
||||||
analytics,
|
DistinctAttributeAnalytics
|
||||||
|distinct: &Option<String>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
analytics.publish(
|
|
||||||
"DistinctAttribute Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"distinct_attribute": {
|
|
||||||
"set": distinct.is_some(),
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -458,20 +294,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
proximity_precision,
|
proximity_precision,
|
||||||
"proximityPrecision",
|
"proximityPrecision",
|
||||||
analytics,
|
ProximityPrecisionAnalytics
|
||||||
|precision: &Option<meilisearch_types::settings::ProximityPrecisionView>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
analytics.publish(
|
|
||||||
"ProximityPrecision Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"proximity_precision": {
|
|
||||||
"set": precision.is_some(),
|
|
||||||
"value": precision.unwrap_or_default(),
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -483,17 +306,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
localized_attributes,
|
localized_attributes,
|
||||||
"localizedAttributes",
|
"localizedAttributes",
|
||||||
analytics,
|
LocalesAnalytics
|
||||||
|rules: &Option<Vec<meilisearch_types::locales::LocalizedAttributesRuleView>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
analytics.publish(
|
|
||||||
"LocalizedAttributesRules Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>())
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -505,26 +318,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
ranking_rules,
|
ranking_rules,
|
||||||
"rankingRules",
|
"rankingRules",
|
||||||
analytics,
|
RankingRulesAnalytics
|
||||||
|setting: &Option<Vec<meilisearch_types::settings::RankingRuleView>>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"RankingRules Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"ranking_rules": {
|
|
||||||
"words_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))),
|
|
||||||
"typo_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))),
|
|
||||||
"proximity_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Proximity))),
|
|
||||||
"attribute_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Attribute))),
|
|
||||||
"sort_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))),
|
|
||||||
"exactness_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Exactness))),
|
|
||||||
"values": setting.as_ref().map(|rr| rr.iter().filter(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Asc(_) | meilisearch_types::settings::RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::<Vec<_>>().join(", ")),
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -536,25 +330,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
faceting,
|
faceting,
|
||||||
"faceting",
|
"faceting",
|
||||||
analytics,
|
FacetingAnalytics
|
||||||
|setting: &Option<meilisearch_types::settings::FacetingSettings>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"Faceting Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"faceting": {
|
|
||||||
"max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()),
|
|
||||||
"sort_facet_values_by_star_count": setting.as_ref().and_then(|s| {
|
|
||||||
s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
|
|
||||||
}),
|
|
||||||
"sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -566,20 +342,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
pagination,
|
pagination,
|
||||||
"pagination",
|
"pagination",
|
||||||
analytics,
|
PaginationAnalytics
|
||||||
|setting: &Option<meilisearch_types::settings::PaginationSettings>, req: &HttpRequest| {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"Pagination Updated".to_string(),
|
|
||||||
json!({
|
|
||||||
"pagination": {
|
|
||||||
"max_total_hits": setting.as_ref().and_then(|s| s.max_total_hits.set()),
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
@ -591,75 +354,9 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
embedders,
|
embedders,
|
||||||
"embedders",
|
"embedders",
|
||||||
analytics,
|
EmbeddersAnalytics
|
||||||
|setting: &Option<std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>>, req: &HttpRequest| {
|
|
||||||
|
|
||||||
|
|
||||||
analytics.publish(
|
|
||||||
"Embedders Updated".to_string(),
|
|
||||||
serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
fn embedder_analytics(
|
|
||||||
setting: Option<
|
|
||||||
&std::collections::BTreeMap<
|
|
||||||
String,
|
|
||||||
Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>,
|
|
||||||
>,
|
|
||||||
>,
|
|
||||||
) -> serde_json::Value {
|
|
||||||
let mut sources = std::collections::HashSet::new();
|
|
||||||
|
|
||||||
if let Some(s) = &setting {
|
|
||||||
for source in s
|
|
||||||
.values()
|
|
||||||
.filter_map(|config| config.clone().set())
|
|
||||||
.filter_map(|config| config.source.set())
|
|
||||||
{
|
|
||||||
use meilisearch_types::milli::vector::settings::EmbedderSource;
|
|
||||||
match source {
|
|
||||||
EmbedderSource::OpenAi => sources.insert("openAi"),
|
|
||||||
EmbedderSource::HuggingFace => sources.insert("huggingFace"),
|
|
||||||
EmbedderSource::UserProvided => sources.insert("userProvided"),
|
|
||||||
EmbedderSource::Ollama => sources.insert("ollama"),
|
|
||||||
EmbedderSource::Rest => sources.insert("rest"),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let document_template_used = setting.as_ref().map(|map| {
|
|
||||||
map.values()
|
|
||||||
.filter_map(|config| config.clone().set())
|
|
||||||
.any(|config| config.document_template.set().is_some())
|
|
||||||
});
|
|
||||||
|
|
||||||
let document_template_max_bytes = setting.as_ref().and_then(|map| {
|
|
||||||
map.values()
|
|
||||||
.filter_map(|config| config.clone().set())
|
|
||||||
.filter_map(|config| config.document_template_max_bytes.set())
|
|
||||||
.max()
|
|
||||||
});
|
|
||||||
|
|
||||||
let binary_quantization_used = setting.as_ref().map(|map| {
|
|
||||||
map.values()
|
|
||||||
.filter_map(|config| config.clone().set())
|
|
||||||
.any(|config| config.binary_quantized.set().is_some())
|
|
||||||
});
|
|
||||||
|
|
||||||
json!(
|
|
||||||
{
|
|
||||||
"total": setting.as_ref().map(|s| s.len()),
|
|
||||||
"sources": sources,
|
|
||||||
"document_template_used": document_template_used,
|
|
||||||
"document_template_max_bytes": document_template_max_bytes,
|
|
||||||
"binary_quantization_used": binary_quantization_used,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
make_setting_route!(
|
make_setting_route!(
|
||||||
"/search-cutoff-ms",
|
"/search-cutoff-ms",
|
||||||
put,
|
put,
|
||||||
@ -669,14 +366,7 @@ make_setting_route!(
|
|||||||
>,
|
>,
|
||||||
search_cutoff_ms,
|
search_cutoff_ms,
|
||||||
"searchCutoffMs",
|
"searchCutoffMs",
|
||||||
analytics,
|
SearchCutoffMsAnalytics
|
||||||
|setting: &Option<u64>, req: &HttpRequest| {
|
|
||||||
analytics.publish(
|
|
||||||
"Search Cutoff Updated".to_string(),
|
|
||||||
serde_json::json!({"search_cutoff_ms": setting }),
|
|
||||||
Some(req),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
|
|
||||||
macro_rules! generate_configure {
|
macro_rules! generate_configure {
|
||||||
@ -720,7 +410,7 @@ pub async fn update_all(
|
|||||||
body: AwebJson<Settings<Unchecked>, DeserrJsonError>,
|
body: AwebJson<Settings<Unchecked>, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
@ -729,104 +419,45 @@ pub async fn update_all(
|
|||||||
let new_settings = validate_settings(new_settings, &index_scheduler)?;
|
let new_settings = validate_settings(new_settings, &index_scheduler)?;
|
||||||
|
|
||||||
analytics.publish(
|
analytics.publish(
|
||||||
"Settings Updated".to_string(),
|
SettingsAnalytics {
|
||||||
json!({
|
ranking_rules: RankingRulesAnalytics::new(new_settings.ranking_rules.as_ref().set()),
|
||||||
"ranking_rules": {
|
searchable_attributes: SearchableAttributesAnalytics::new(
|
||||||
"words_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Words))),
|
new_settings.searchable_attributes.as_ref().set(),
|
||||||
"typo_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Typo))),
|
),
|
||||||
"proximity_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Proximity))),
|
displayed_attributes: DisplayedAttributesAnalytics::new(
|
||||||
"attribute_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Attribute))),
|
new_settings.displayed_attributes.as_ref().set(),
|
||||||
"sort_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Sort))),
|
),
|
||||||
"exactness_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Exactness))),
|
sortable_attributes: SortableAttributesAnalytics::new(
|
||||||
"values": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().filter(|s| !matches!(s, RankingRuleView::Asc(_) | RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::<Vec<_>>().join(", ")),
|
new_settings.sortable_attributes.as_ref().set(),
|
||||||
},
|
),
|
||||||
"searchable_attributes": {
|
filterable_attributes: FilterableAttributesAnalytics::new(
|
||||||
"total": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.len()),
|
new_settings.filterable_attributes.as_ref().set(),
|
||||||
"with_wildcard": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
|
),
|
||||||
},
|
distinct_attribute: DistinctAttributeAnalytics::new(
|
||||||
"displayed_attributes": {
|
new_settings.distinct_attribute.as_ref().set(),
|
||||||
"total": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.len()),
|
),
|
||||||
"with_wildcard": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
|
proximity_precision: ProximityPrecisionAnalytics::new(
|
||||||
},
|
new_settings.proximity_precision.as_ref().set(),
|
||||||
"sortable_attributes": {
|
),
|
||||||
"total": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.len()),
|
typo_tolerance: TypoToleranceAnalytics::new(new_settings.typo_tolerance.as_ref().set()),
|
||||||
"has_geo": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.iter().any(|s| s == "_geo")),
|
faceting: FacetingAnalytics::new(new_settings.faceting.as_ref().set()),
|
||||||
},
|
pagination: PaginationAnalytics::new(new_settings.pagination.as_ref().set()),
|
||||||
"filterable_attributes": {
|
stop_words: StopWordsAnalytics::new(new_settings.stop_words.as_ref().set()),
|
||||||
"total": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.len()),
|
synonyms: SynonymsAnalytics::new(new_settings.synonyms.as_ref().set()),
|
||||||
"has_geo": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.iter().any(|s| s == "_geo")),
|
embedders: EmbeddersAnalytics::new(new_settings.embedders.as_ref().set()),
|
||||||
},
|
search_cutoff_ms: SearchCutoffMsAnalytics::new(
|
||||||
"distinct_attribute": {
|
new_settings.search_cutoff_ms.as_ref().set(),
|
||||||
"set": new_settings.distinct_attribute.as_ref().set().is_some()
|
),
|
||||||
},
|
locales: LocalesAnalytics::new(new_settings.localized_attributes.as_ref().set()),
|
||||||
"proximity_precision": {
|
dictionary: DictionaryAnalytics::new(new_settings.dictionary.as_ref().set()),
|
||||||
"set": new_settings.proximity_precision.as_ref().set().is_some(),
|
separator_tokens: SeparatorTokensAnalytics::new(
|
||||||
"value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
|
new_settings.separator_tokens.as_ref().set(),
|
||||||
},
|
),
|
||||||
"typo_tolerance": {
|
non_separator_tokens: NonSeparatorTokensAnalytics::new(
|
||||||
"enabled": new_settings.typo_tolerance
|
new_settings.non_separator_tokens.as_ref().set(),
|
||||||
.as_ref()
|
),
|
||||||
.set()
|
},
|
||||||
.and_then(|s| s.enabled.as_ref().set())
|
&req,
|
||||||
.copied(),
|
|
||||||
"disable_on_attributes": new_settings.typo_tolerance
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
|
|
||||||
"disable_on_words": new_settings.typo_tolerance
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
|
|
||||||
"min_word_size_for_one_typo": new_settings.typo_tolerance
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.min_word_size_for_typos
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.map(|s| s.one_typo.set()))
|
|
||||||
.flatten(),
|
|
||||||
"min_word_size_for_two_typos": new_settings.typo_tolerance
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.min_word_size_for_typos
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.map(|s| s.two_typos.set()))
|
|
||||||
.flatten(),
|
|
||||||
},
|
|
||||||
"faceting": {
|
|
||||||
"max_values_per_facet": new_settings.faceting
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.max_values_per_facet.as_ref().set()),
|
|
||||||
"sort_facet_values_by_star_count": new_settings.faceting
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| {
|
|
||||||
s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
|
|
||||||
}),
|
|
||||||
"sort_facet_values_by_total": new_settings.faceting
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
|
|
||||||
},
|
|
||||||
"pagination": {
|
|
||||||
"max_total_hits": new_settings.pagination
|
|
||||||
.as_ref()
|
|
||||||
.set()
|
|
||||||
.and_then(|s| s.max_total_hits.as_ref().set()),
|
|
||||||
},
|
|
||||||
"stop_words": {
|
|
||||||
"total": new_settings.stop_words.as_ref().set().map(|stop_words| stop_words.len()),
|
|
||||||
},
|
|
||||||
"synonyms": {
|
|
||||||
"total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()),
|
|
||||||
},
|
|
||||||
"embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()),
|
|
||||||
"search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(),
|
|
||||||
"locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>()),
|
|
||||||
}),
|
|
||||||
Some(&req),
|
|
||||||
);
|
);
|
||||||
|
|
||||||
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
|
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
|
||||||
|
622
crates/meilisearch/src/routes/indexes/settings_analytics.rs
Normal file
622
crates/meilisearch/src/routes/indexes/settings_analytics.rs
Normal file
@ -0,0 +1,622 @@
|
|||||||
|
//! All the structures used to make the analytics on the settings work.
|
||||||
|
//! The signatures of the `new` functions are not very rust idiomatic because they must match the types received
|
||||||
|
//! through the sub-settings route directly without any manipulation.
|
||||||
|
//! This is why we often use a `Option<&Vec<_>>` instead of a `Option<&[_]>`.
|
||||||
|
|
||||||
|
use std::collections::{BTreeMap, BTreeSet, HashSet};
|
||||||
|
|
||||||
|
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
||||||
|
use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView};
|
||||||
|
use meilisearch_types::milli::update::Setting;
|
||||||
|
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
|
||||||
|
use meilisearch_types::settings::{
|
||||||
|
FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings,
|
||||||
|
};
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
use crate::analytics::Aggregate;
|
||||||
|
|
||||||
|
/// Analytics payload describing a settings update.
///
/// It holds one sub-structure per setting. Every sub-structure implements `Default`, so a
/// payload that only concerns a single setting can be built with struct-update syntax
/// (`..Default::default()`), which is what the `into_settings` helpers of the
/// sub-structures do.
#[derive(Serialize, Default)]
pub struct SettingsAnalytics {
    pub ranking_rules: RankingRulesAnalytics,
    pub searchable_attributes: SearchableAttributesAnalytics,
    pub displayed_attributes: DisplayedAttributesAnalytics,
    pub sortable_attributes: SortableAttributesAnalytics,
    pub filterable_attributes: FilterableAttributesAnalytics,
    pub distinct_attribute: DistinctAttributeAnalytics,
    pub proximity_precision: ProximityPrecisionAnalytics,
    pub typo_tolerance: TypoToleranceAnalytics,
    pub faceting: FacetingAnalytics,
    pub pagination: PaginationAnalytics,
    pub stop_words: StopWordsAnalytics,
    pub synonyms: SynonymsAnalytics,
    pub embedders: EmbeddersAnalytics,
    pub search_cutoff_ms: SearchCutoffMsAnalytics,
    pub locales: LocalesAnalytics,
    pub dictionary: DictionaryAnalytics,
    pub separator_tokens: SeparatorTokensAnalytics,
    pub non_separator_tokens: NonSeparatorTokensAnalytics,
}
|
||||||
|
|
||||||
|
impl Aggregate for SettingsAnalytics {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Settings Updated"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
ranking_rules: RankingRulesAnalytics {
|
||||||
|
words_position: new
|
||||||
|
.ranking_rules
|
||||||
|
.words_position
|
||||||
|
.or(self.ranking_rules.words_position),
|
||||||
|
typo_position: new.ranking_rules.typo_position.or(self.ranking_rules.typo_position),
|
||||||
|
proximity_position: new
|
||||||
|
.ranking_rules
|
||||||
|
.proximity_position
|
||||||
|
.or(self.ranking_rules.proximity_position),
|
||||||
|
attribute_position: new
|
||||||
|
.ranking_rules
|
||||||
|
.attribute_position
|
||||||
|
.or(self.ranking_rules.attribute_position),
|
||||||
|
sort_position: new.ranking_rules.sort_position.or(self.ranking_rules.sort_position),
|
||||||
|
exactness_position: new
|
||||||
|
.ranking_rules
|
||||||
|
.exactness_position
|
||||||
|
.or(self.ranking_rules.exactness_position),
|
||||||
|
values: new.ranking_rules.values.or(self.ranking_rules.values),
|
||||||
|
},
|
||||||
|
searchable_attributes: SearchableAttributesAnalytics {
|
||||||
|
total: new.searchable_attributes.total.or(self.searchable_attributes.total),
|
||||||
|
with_wildcard: new
|
||||||
|
.searchable_attributes
|
||||||
|
.with_wildcard
|
||||||
|
.or(self.searchable_attributes.with_wildcard),
|
||||||
|
},
|
||||||
|
displayed_attributes: DisplayedAttributesAnalytics {
|
||||||
|
total: new.displayed_attributes.total.or(self.displayed_attributes.total),
|
||||||
|
with_wildcard: new
|
||||||
|
.displayed_attributes
|
||||||
|
.with_wildcard
|
||||||
|
.or(self.displayed_attributes.with_wildcard),
|
||||||
|
},
|
||||||
|
sortable_attributes: SortableAttributesAnalytics {
|
||||||
|
total: new.sortable_attributes.total.or(self.sortable_attributes.total),
|
||||||
|
has_geo: new.sortable_attributes.has_geo.or(self.sortable_attributes.has_geo),
|
||||||
|
},
|
||||||
|
filterable_attributes: FilterableAttributesAnalytics {
|
||||||
|
total: new.filterable_attributes.total.or(self.filterable_attributes.total),
|
||||||
|
has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo),
|
||||||
|
},
|
||||||
|
distinct_attribute: DistinctAttributeAnalytics {
|
||||||
|
set: self.distinct_attribute.set | new.distinct_attribute.set,
|
||||||
|
},
|
||||||
|
proximity_precision: ProximityPrecisionAnalytics {
|
||||||
|
set: self.proximity_precision.set | new.proximity_precision.set,
|
||||||
|
value: new.proximity_precision.value.or(self.proximity_precision.value),
|
||||||
|
},
|
||||||
|
typo_tolerance: TypoToleranceAnalytics {
|
||||||
|
enabled: new.typo_tolerance.enabled.or(self.typo_tolerance.enabled),
|
||||||
|
disable_on_attributes: new
|
||||||
|
.typo_tolerance
|
||||||
|
.disable_on_attributes
|
||||||
|
.or(self.typo_tolerance.disable_on_attributes),
|
||||||
|
disable_on_words: new
|
||||||
|
.typo_tolerance
|
||||||
|
.disable_on_words
|
||||||
|
.or(self.typo_tolerance.disable_on_words),
|
||||||
|
min_word_size_for_one_typo: new
|
||||||
|
.typo_tolerance
|
||||||
|
.min_word_size_for_one_typo
|
||||||
|
.or(self.typo_tolerance.min_word_size_for_one_typo),
|
||||||
|
min_word_size_for_two_typos: new
|
||||||
|
.typo_tolerance
|
||||||
|
.min_word_size_for_two_typos
|
||||||
|
.or(self.typo_tolerance.min_word_size_for_two_typos),
|
||||||
|
},
|
||||||
|
faceting: FacetingAnalytics {
|
||||||
|
max_values_per_facet: new
|
||||||
|
.faceting
|
||||||
|
.max_values_per_facet
|
||||||
|
.or(self.faceting.max_values_per_facet),
|
||||||
|
sort_facet_values_by_star_count: new
|
||||||
|
.faceting
|
||||||
|
.sort_facet_values_by_star_count
|
||||||
|
.or(self.faceting.sort_facet_values_by_star_count),
|
||||||
|
sort_facet_values_by_total: new
|
||||||
|
.faceting
|
||||||
|
.sort_facet_values_by_total
|
||||||
|
.or(self.faceting.sort_facet_values_by_total),
|
||||||
|
},
|
||||||
|
pagination: PaginationAnalytics {
|
||||||
|
max_total_hits: new.pagination.max_total_hits.or(self.pagination.max_total_hits),
|
||||||
|
},
|
||||||
|
stop_words: StopWordsAnalytics {
|
||||||
|
total: new.stop_words.total.or(self.stop_words.total),
|
||||||
|
},
|
||||||
|
synonyms: SynonymsAnalytics { total: new.synonyms.total.or(self.synonyms.total) },
|
||||||
|
embedders: EmbeddersAnalytics {
|
||||||
|
total: new.embedders.total.or(self.embedders.total),
|
||||||
|
sources: match (self.embedders.sources, new.embedders.sources) {
|
||||||
|
(None, None) => None,
|
||||||
|
(Some(sources), None) | (None, Some(sources)) => Some(sources),
|
||||||
|
(Some(this), Some(other)) => Some(this.union(&other).cloned().collect()),
|
||||||
|
},
|
||||||
|
document_template_used: match (
|
||||||
|
self.embedders.document_template_used,
|
||||||
|
new.embedders.document_template_used,
|
||||||
|
) {
|
||||||
|
(None, None) => None,
|
||||||
|
(Some(used), None) | (None, Some(used)) => Some(used),
|
||||||
|
(Some(this), Some(other)) => Some(this | other),
|
||||||
|
},
|
||||||
|
document_template_max_bytes: match (
|
||||||
|
self.embedders.document_template_max_bytes,
|
||||||
|
new.embedders.document_template_max_bytes,
|
||||||
|
) {
|
||||||
|
(None, None) => None,
|
||||||
|
(Some(bytes), None) | (None, Some(bytes)) => Some(bytes),
|
||||||
|
(Some(this), Some(other)) => Some(this.max(other)),
|
||||||
|
},
|
||||||
|
binary_quantization_used: match (
|
||||||
|
self.embedders.binary_quantization_used,
|
||||||
|
new.embedders.binary_quantization_used,
|
||||||
|
) {
|
||||||
|
(None, None) => None,
|
||||||
|
(Some(bq), None) | (None, Some(bq)) => Some(bq),
|
||||||
|
(Some(this), Some(other)) => Some(this | other),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
search_cutoff_ms: SearchCutoffMsAnalytics {
|
||||||
|
search_cutoff_ms: new
|
||||||
|
.search_cutoff_ms
|
||||||
|
.search_cutoff_ms
|
||||||
|
.or(self.search_cutoff_ms.search_cutoff_ms),
|
||||||
|
},
|
||||||
|
locales: LocalesAnalytics { locales: new.locales.locales.or(self.locales.locales) },
|
||||||
|
dictionary: DictionaryAnalytics {
|
||||||
|
total: new.dictionary.total.or(self.dictionary.total),
|
||||||
|
},
|
||||||
|
separator_tokens: SeparatorTokensAnalytics {
|
||||||
|
total: new.non_separator_tokens.total.or(self.separator_tokens.total),
|
||||||
|
},
|
||||||
|
non_separator_tokens: NonSeparatorTokensAnalytics {
|
||||||
|
total: new.non_separator_tokens.total.or(self.non_separator_tokens.total),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct RankingRulesAnalytics {
|
||||||
|
pub words_position: Option<usize>,
|
||||||
|
pub typo_position: Option<usize>,
|
||||||
|
pub proximity_position: Option<usize>,
|
||||||
|
pub attribute_position: Option<usize>,
|
||||||
|
pub sort_position: Option<usize>,
|
||||||
|
pub exactness_position: Option<usize>,
|
||||||
|
pub values: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RankingRulesAnalytics {
|
||||||
|
pub fn new(rr: Option<&Vec<RankingRuleView>>) -> Self {
|
||||||
|
RankingRulesAnalytics {
|
||||||
|
words_position: rr.as_ref().and_then(|rr| {
|
||||||
|
rr.iter()
|
||||||
|
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))
|
||||||
|
}),
|
||||||
|
typo_position: rr.as_ref().and_then(|rr| {
|
||||||
|
rr.iter()
|
||||||
|
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))
|
||||||
|
}),
|
||||||
|
proximity_position: rr.as_ref().and_then(|rr| {
|
||||||
|
rr.iter().position(|s| {
|
||||||
|
matches!(s, meilisearch_types::settings::RankingRuleView::Proximity)
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
attribute_position: rr.as_ref().and_then(|rr| {
|
||||||
|
rr.iter().position(|s| {
|
||||||
|
matches!(s, meilisearch_types::settings::RankingRuleView::Attribute)
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
sort_position: rr.as_ref().and_then(|rr| {
|
||||||
|
rr.iter()
|
||||||
|
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))
|
||||||
|
}),
|
||||||
|
exactness_position: rr.as_ref().and_then(|rr| {
|
||||||
|
rr.iter().position(|s| {
|
||||||
|
matches!(s, meilisearch_types::settings::RankingRuleView::Exactness)
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
values: rr.as_ref().map(|rr| {
|
||||||
|
rr.iter()
|
||||||
|
.filter(|s| {
|
||||||
|
matches!(
|
||||||
|
s,
|
||||||
|
meilisearch_types::settings::RankingRuleView::Asc(_)
|
||||||
|
| meilisearch_types::settings::RankingRuleView::Desc(_)
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.map(|x| x.to_string())
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(", ")
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { ranking_rules: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct SearchableAttributesAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
pub with_wildcard: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SearchableAttributesAnalytics {
|
||||||
|
pub fn new(setting: Option<&Vec<String>>) -> Self {
|
||||||
|
Self {
|
||||||
|
total: setting.as_ref().map(|searchable| searchable.len()),
|
||||||
|
with_wildcard: setting
|
||||||
|
.as_ref()
|
||||||
|
.map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { searchable_attributes: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct DisplayedAttributesAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
pub with_wildcard: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DisplayedAttributesAnalytics {
|
||||||
|
pub fn new(displayed: Option<&Vec<String>>) -> Self {
|
||||||
|
Self {
|
||||||
|
total: displayed.as_ref().map(|displayed| displayed.len()),
|
||||||
|
with_wildcard: displayed
|
||||||
|
.as_ref()
|
||||||
|
.map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { displayed_attributes: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct SortableAttributesAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
pub has_geo: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SortableAttributesAnalytics {
|
||||||
|
pub fn new(setting: Option<&BTreeSet<String>>) -> Self {
|
||||||
|
Self {
|
||||||
|
total: setting.as_ref().map(|sort| sort.len()),
|
||||||
|
has_geo: setting.as_ref().map(|sort| sort.contains("_geo")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { sortable_attributes: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct FilterableAttributesAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
pub has_geo: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FilterableAttributesAnalytics {
|
||||||
|
pub fn new(setting: Option<&BTreeSet<String>>) -> Self {
|
||||||
|
Self {
|
||||||
|
total: setting.as_ref().map(|filter| filter.len()),
|
||||||
|
has_geo: setting.as_ref().map(|filter| filter.contains("_geo")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { filterable_attributes: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct DistinctAttributeAnalytics {
|
||||||
|
pub set: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DistinctAttributeAnalytics {
|
||||||
|
pub fn new(distinct: Option<&String>) -> Self {
|
||||||
|
Self { set: distinct.is_some() }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { distinct_attribute: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct ProximityPrecisionAnalytics {
|
||||||
|
pub set: bool,
|
||||||
|
pub value: Option<ProximityPrecisionView>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ProximityPrecisionAnalytics {
|
||||||
|
pub fn new(precision: Option<&ProximityPrecisionView>) -> Self {
|
||||||
|
Self { set: precision.is_some(), value: precision.cloned() }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { proximity_precision: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct TypoToleranceAnalytics {
|
||||||
|
pub enabled: Option<bool>,
|
||||||
|
pub disable_on_attributes: Option<bool>,
|
||||||
|
pub disable_on_words: Option<bool>,
|
||||||
|
pub min_word_size_for_one_typo: Option<u8>,
|
||||||
|
pub min_word_size_for_two_typos: Option<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TypoToleranceAnalytics {
|
||||||
|
pub fn new(setting: Option<&TypoSettings>) -> Self {
|
||||||
|
Self {
|
||||||
|
enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))),
|
||||||
|
disable_on_attributes: setting
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
|
||||||
|
disable_on_words: setting
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
|
||||||
|
min_word_size_for_one_typo: setting
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set()))
|
||||||
|
.flatten(),
|
||||||
|
min_word_size_for_two_typos: setting
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set()))
|
||||||
|
.flatten(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { typo_tolerance: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct FacetingAnalytics {
|
||||||
|
pub max_values_per_facet: Option<usize>,
|
||||||
|
pub sort_facet_values_by_star_count: Option<bool>,
|
||||||
|
pub sort_facet_values_by_total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetingAnalytics {
|
||||||
|
pub fn new(setting: Option<&FacetingSettings>) -> Self {
|
||||||
|
Self {
|
||||||
|
max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()),
|
||||||
|
sort_facet_values_by_star_count: setting.as_ref().and_then(|s| {
|
||||||
|
s.sort_facet_values_by
|
||||||
|
.as_ref()
|
||||||
|
.set()
|
||||||
|
.map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
|
||||||
|
}),
|
||||||
|
sort_facet_values_by_total: setting
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { faceting: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct PaginationAnalytics {
|
||||||
|
pub max_total_hits: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PaginationAnalytics {
|
||||||
|
pub fn new(setting: Option<&PaginationSettings>) -> Self {
|
||||||
|
Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { pagination: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct StopWordsAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StopWordsAnalytics {
|
||||||
|
pub fn new(stop_words: Option<&BTreeSet<String>>) -> Self {
|
||||||
|
Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { stop_words: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct SynonymsAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SynonymsAnalytics {
|
||||||
|
pub fn new(synonyms: Option<&BTreeMap<String, Vec<String>>>) -> Self {
|
||||||
|
Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { synonyms: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct EmbeddersAnalytics {
|
||||||
|
// last
|
||||||
|
pub total: Option<usize>,
|
||||||
|
// Merge the sources
|
||||||
|
pub sources: Option<HashSet<String>>,
|
||||||
|
// |=
|
||||||
|
pub document_template_used: Option<bool>,
|
||||||
|
// max
|
||||||
|
pub document_template_max_bytes: Option<usize>,
|
||||||
|
// |=
|
||||||
|
pub binary_quantization_used: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EmbeddersAnalytics {
|
||||||
|
pub fn new(setting: Option<&BTreeMap<String, Setting<EmbeddingSettings>>>) -> Self {
|
||||||
|
let mut sources = std::collections::HashSet::new();
|
||||||
|
|
||||||
|
if let Some(s) = &setting {
|
||||||
|
for source in s
|
||||||
|
.values()
|
||||||
|
.filter_map(|config| config.clone().set())
|
||||||
|
.filter_map(|config| config.source.set())
|
||||||
|
{
|
||||||
|
use meilisearch_types::milli::vector::settings::EmbedderSource;
|
||||||
|
match source {
|
||||||
|
EmbedderSource::OpenAi => sources.insert("openAi".to_string()),
|
||||||
|
EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()),
|
||||||
|
EmbedderSource::UserProvided => sources.insert("userProvided".to_string()),
|
||||||
|
EmbedderSource::Ollama => sources.insert("ollama".to_string()),
|
||||||
|
EmbedderSource::Rest => sources.insert("rest".to_string()),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
total: setting.as_ref().map(|s| s.len()),
|
||||||
|
sources: Some(sources),
|
||||||
|
document_template_used: setting.as_ref().map(|map| {
|
||||||
|
map.values()
|
||||||
|
.filter_map(|config| config.clone().set())
|
||||||
|
.any(|config| config.document_template.set().is_some())
|
||||||
|
}),
|
||||||
|
document_template_max_bytes: setting.as_ref().and_then(|map| {
|
||||||
|
map.values()
|
||||||
|
.filter_map(|config| config.clone().set())
|
||||||
|
.filter_map(|config| config.document_template_max_bytes.set())
|
||||||
|
.max()
|
||||||
|
}),
|
||||||
|
binary_quantization_used: setting.as_ref().map(|map| {
|
||||||
|
map.values()
|
||||||
|
.filter_map(|config| config.clone().set())
|
||||||
|
.any(|config| config.binary_quantized.set().is_some())
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { embedders: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
#[serde(transparent)]
|
||||||
|
pub struct SearchCutoffMsAnalytics {
|
||||||
|
pub search_cutoff_ms: Option<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SearchCutoffMsAnalytics {
|
||||||
|
pub fn new(setting: Option<&u64>) -> Self {
|
||||||
|
Self { search_cutoff_ms: setting.copied() }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { search_cutoff_ms: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
#[serde(transparent)]
|
||||||
|
pub struct LocalesAnalytics {
|
||||||
|
pub locales: Option<BTreeSet<Locale>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LocalesAnalytics {
|
||||||
|
pub fn new(rules: Option<&Vec<LocalizedAttributesRuleView>>) -> Self {
|
||||||
|
LocalesAnalytics {
|
||||||
|
locales: rules.as_ref().map(|rules| {
|
||||||
|
rules
|
||||||
|
.iter()
|
||||||
|
.flat_map(|rule| rule.locales.iter().cloned())
|
||||||
|
.collect::<std::collections::BTreeSet<_>>()
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { locales: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct DictionaryAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DictionaryAnalytics {
|
||||||
|
pub fn new(dictionary: Option<&BTreeSet<String>>) -> Self {
|
||||||
|
Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { dictionary: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct SeparatorTokensAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SeparatorTokensAnalytics {
|
||||||
|
pub fn new(separator_tokens: Option<&BTreeSet<String>>) -> Self {
|
||||||
|
Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { separator_tokens: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Default)]
|
||||||
|
pub struct NonSeparatorTokensAnalytics {
|
||||||
|
pub total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NonSeparatorTokensAnalytics {
|
||||||
|
pub fn new(non_separator_tokens: Option<&BTreeSet<String>>) -> Self {
|
||||||
|
Self {
|
||||||
|
total: non_separator_tokens
|
||||||
|
.as_ref()
|
||||||
|
.map(|non_separator_tokens| non_separator_tokens.len()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_settings(self) -> SettingsAnalytics {
|
||||||
|
SettingsAnalytics { non_separator_tokens: self, ..Default::default() }
|
||||||
|
}
|
||||||
|
}
|
@ -13,9 +13,10 @@ use serde_json::Value;
|
|||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use super::ActionPolicy;
|
use super::ActionPolicy;
|
||||||
use crate::analytics::{Analytics, SimilarAggregator};
|
use crate::analytics::Analytics;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
use crate::extractors::sequential_extractor::SeqHandler;
|
use crate::extractors::sequential_extractor::SeqHandler;
|
||||||
|
use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST};
|
||||||
use crate::search::{
|
use crate::search::{
|
||||||
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
|
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
|
||||||
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||||
@ -34,13 +35,13 @@ pub async fn similar_get(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebQueryParameter<SimilarQueryGet, DeserrQueryParamError>,
|
params: AwebQueryParameter<SimilarQueryGet, DeserrQueryParamError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
let query = params.0.try_into()?;
|
let query = params.0.try_into()?;
|
||||||
|
|
||||||
let mut aggregate = SimilarAggregator::from_query(&query, &req);
|
let mut aggregate = SimilarAggregator::<SimilarGET>::from_query(&query);
|
||||||
|
|
||||||
debug!(parameters = ?query, "Similar get");
|
debug!(parameters = ?query, "Similar get");
|
||||||
|
|
||||||
@ -49,7 +50,7 @@ pub async fn similar_get(
|
|||||||
if let Ok(similar) = &similar {
|
if let Ok(similar) = &similar {
|
||||||
aggregate.succeed(similar);
|
aggregate.succeed(similar);
|
||||||
}
|
}
|
||||||
analytics.get_similar(aggregate);
|
analytics.publish(aggregate, &req);
|
||||||
|
|
||||||
let similar = similar?;
|
let similar = similar?;
|
||||||
|
|
||||||
@ -62,21 +63,21 @@ pub async fn similar_post(
|
|||||||
index_uid: web::Path<String>,
|
index_uid: web::Path<String>,
|
||||||
params: AwebJson<SimilarQuery, DeserrJsonError>,
|
params: AwebJson<SimilarQuery, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||||
|
|
||||||
let query = params.into_inner();
|
let query = params.into_inner();
|
||||||
debug!(parameters = ?query, "Similar post");
|
debug!(parameters = ?query, "Similar post");
|
||||||
|
|
||||||
let mut aggregate = SimilarAggregator::from_query(&query, &req);
|
let mut aggregate = SimilarAggregator::<SimilarPOST>::from_query(&query);
|
||||||
|
|
||||||
let similar = similar(index_scheduler, index_uid, query).await;
|
let similar = similar(index_scheduler, index_uid, query).await;
|
||||||
|
|
||||||
if let Ok(similar) = &similar {
|
if let Ok(similar) = &similar {
|
||||||
aggregate.succeed(similar);
|
aggregate.succeed(similar);
|
||||||
}
|
}
|
||||||
analytics.post_similar(aggregate);
|
analytics.publish(aggregate, &req);
|
||||||
|
|
||||||
let similar = similar?;
|
let similar = similar?;
|
||||||
|
|
||||||
|
233
crates/meilisearch/src/routes/indexes/similar_analytics.rs
Normal file
233
crates/meilisearch/src/routes/indexes/similar_analytics.rs
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
use std::collections::{BinaryHeap, HashMap};
|
||||||
|
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::Regex;
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
|
use crate::aggregate_methods;
|
||||||
|
use crate::analytics::{Aggregate, AggregateMethod};
|
||||||
|
use crate::search::{SimilarQuery, SimilarResult};
|
||||||
|
|
||||||
|
// Marker types selecting the event name of a `SimilarAggregator`; each one is
// paired with the event name given on its right.
// NOTE(review): the macro is defined elsewhere; confirm what it generates.
aggregate_methods!(
    SimilarPOST => "Similar POST",
    SimilarGET => "Similar GET",
);
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
pub struct SimilarAggregator<Method: AggregateMethod> {
|
||||||
|
// requests
|
||||||
|
total_received: usize,
|
||||||
|
total_succeeded: usize,
|
||||||
|
time_spent: BinaryHeap<usize>,
|
||||||
|
|
||||||
|
// filter
|
||||||
|
filter_with_geo_radius: bool,
|
||||||
|
filter_with_geo_bounding_box: bool,
|
||||||
|
// every time a request has a filter, this field must be incremented by the number of terms it contains
|
||||||
|
filter_sum_of_criteria_terms: usize,
|
||||||
|
// every time a request has a filter, this field must be incremented by one
|
||||||
|
filter_total_number_of_criteria: usize,
|
||||||
|
used_syntax: HashMap<String, usize>,
|
||||||
|
|
||||||
|
// Whether a non-default embedder was specified
|
||||||
|
retrieve_vectors: bool,
|
||||||
|
|
||||||
|
// pagination
|
||||||
|
max_limit: usize,
|
||||||
|
max_offset: usize,
|
||||||
|
|
||||||
|
// formatting
|
||||||
|
max_attributes_to_retrieve: usize,
|
||||||
|
|
||||||
|
// scoring
|
||||||
|
show_ranking_score: bool,
|
||||||
|
show_ranking_score_details: bool,
|
||||||
|
ranking_score_threshold: bool,
|
||||||
|
|
||||||
|
marker: std::marker::PhantomData<Method>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> SimilarAggregator<Method> {
|
||||||
|
#[allow(clippy::field_reassign_with_default)]
|
||||||
|
pub fn from_query(query: &SimilarQuery) -> Self {
|
||||||
|
let SimilarQuery {
|
||||||
|
id: _,
|
||||||
|
embedder: _,
|
||||||
|
offset,
|
||||||
|
limit,
|
||||||
|
attributes_to_retrieve: _,
|
||||||
|
retrieve_vectors,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
filter,
|
||||||
|
ranking_score_threshold,
|
||||||
|
} = query;
|
||||||
|
|
||||||
|
let mut ret = Self::default();
|
||||||
|
|
||||||
|
ret.total_received = 1;
|
||||||
|
|
||||||
|
if let Some(ref filter) = filter {
|
||||||
|
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
|
||||||
|
ret.filter_total_number_of_criteria = 1;
|
||||||
|
|
||||||
|
let syntax = match filter {
|
||||||
|
Value::String(_) => "string".to_string(),
|
||||||
|
Value::Array(values) => {
|
||||||
|
if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
|
||||||
|
"mixed".to_string()
|
||||||
|
} else {
|
||||||
|
"array".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => "none".to_string(),
|
||||||
|
};
|
||||||
|
// convert the string to a HashMap
|
||||||
|
ret.used_syntax.insert(syntax, 1);
|
||||||
|
|
||||||
|
let stringified_filters = filter.to_string();
|
||||||
|
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
|
||||||
|
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
|
||||||
|
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.max_limit = *limit;
|
||||||
|
ret.max_offset = *offset;
|
||||||
|
|
||||||
|
ret.show_ranking_score = *show_ranking_score;
|
||||||
|
ret.show_ranking_score_details = *show_ranking_score_details;
|
||||||
|
ret.ranking_score_threshold = ranking_score_threshold.is_some();
|
||||||
|
|
||||||
|
ret.retrieve_vectors = *retrieve_vectors;
|
||||||
|
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn succeed(&mut self, result: &SimilarResult) {
|
||||||
|
let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result;
|
||||||
|
|
||||||
|
self.total_succeeded = self.total_succeeded.saturating_add(1);
|
||||||
|
|
||||||
|
self.time_spent.push(*processing_time_ms as usize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod> Aggregate for SimilarAggregator<Method> {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
Method::event_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Aggregate one [SimilarAggregator] into another.
|
||||||
|
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
let Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
mut time_spent,
|
||||||
|
filter_with_geo_radius,
|
||||||
|
filter_with_geo_bounding_box,
|
||||||
|
filter_sum_of_criteria_terms,
|
||||||
|
filter_total_number_of_criteria,
|
||||||
|
used_syntax,
|
||||||
|
max_limit,
|
||||||
|
max_offset,
|
||||||
|
max_attributes_to_retrieve,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
ranking_score_threshold,
|
||||||
|
retrieve_vectors,
|
||||||
|
marker: _,
|
||||||
|
} = *new;
|
||||||
|
|
||||||
|
// request
|
||||||
|
self.total_received = self.total_received.saturating_add(total_received);
|
||||||
|
self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
|
||||||
|
self.time_spent.append(&mut time_spent);
|
||||||
|
|
||||||
|
// filter
|
||||||
|
self.filter_with_geo_radius |= filter_with_geo_radius;
|
||||||
|
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
|
||||||
|
self.filter_sum_of_criteria_terms =
|
||||||
|
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
|
||||||
|
self.filter_total_number_of_criteria =
|
||||||
|
self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria);
|
||||||
|
for (key, value) in used_syntax.into_iter() {
|
||||||
|
let used_syntax = self.used_syntax.entry(key).or_insert(0);
|
||||||
|
*used_syntax = used_syntax.saturating_add(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.retrieve_vectors |= retrieve_vectors;
|
||||||
|
|
||||||
|
// pagination
|
||||||
|
self.max_limit = self.max_limit.max(max_limit);
|
||||||
|
self.max_offset = self.max_offset.max(max_offset);
|
||||||
|
|
||||||
|
// formatting
|
||||||
|
self.max_attributes_to_retrieve =
|
||||||
|
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve);
|
||||||
|
|
||||||
|
// scoring
|
||||||
|
self.show_ranking_score |= show_ranking_score;
|
||||||
|
self.show_ranking_score_details |= show_ranking_score_details;
|
||||||
|
self.ranking_score_threshold |= ranking_score_threshold;
|
||||||
|
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
let Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
time_spent,
|
||||||
|
filter_with_geo_radius,
|
||||||
|
filter_with_geo_bounding_box,
|
||||||
|
filter_sum_of_criteria_terms,
|
||||||
|
filter_total_number_of_criteria,
|
||||||
|
used_syntax,
|
||||||
|
max_limit,
|
||||||
|
max_offset,
|
||||||
|
max_attributes_to_retrieve,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
ranking_score_threshold,
|
||||||
|
retrieve_vectors,
|
||||||
|
marker: _,
|
||||||
|
} = *self;
|
||||||
|
|
||||||
|
// we get all the values in a sorted manner
|
||||||
|
let time_spent = time_spent.into_sorted_vec();
|
||||||
|
// the index of the 99th percentage of value
|
||||||
|
let percentile_99th = time_spent.len() * 99 / 100;
|
||||||
|
// We are only interested by the slowest value of the 99th fastest results
|
||||||
|
let time_spent = time_spent.get(percentile_99th);
|
||||||
|
|
||||||
|
json!({
|
||||||
|
"requests": {
|
||||||
|
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
|
||||||
|
"total_succeeded": total_succeeded,
|
||||||
|
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||||
|
"total_received": total_received,
|
||||||
|
},
|
||||||
|
"filter": {
|
||||||
|
"with_geoRadius": filter_with_geo_radius,
|
||||||
|
"with_geoBoundingBox": filter_with_geo_bounding_box,
|
||||||
|
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
|
||||||
|
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||||
|
},
|
||||||
|
"vector": {
|
||||||
|
"retrieve_vectors": retrieve_vectors,
|
||||||
|
},
|
||||||
|
"pagination": {
|
||||||
|
"max_limit": max_limit,
|
||||||
|
"max_offset": max_offset,
|
||||||
|
},
|
||||||
|
"formatting": {
|
||||||
|
"max_attributes_to_retrieve": max_attributes_to_retrieve,
|
||||||
|
},
|
||||||
|
"scoring": {
|
||||||
|
"show_ranking_score": show_ranking_score,
|
||||||
|
"show_ranking_score_details": show_ranking_score_details,
|
||||||
|
"ranking_score_threshold": ranking_score_threshold,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
@ -25,6 +25,7 @@ pub mod indexes;
|
|||||||
mod logs;
|
mod logs;
|
||||||
mod metrics;
|
mod metrics;
|
||||||
mod multi_search;
|
mod multi_search;
|
||||||
|
mod multi_search_analytics;
|
||||||
mod snapshot;
|
mod snapshot;
|
||||||
mod swap_indexes;
|
mod swap_indexes;
|
||||||
pub mod tasks;
|
pub mod tasks;
|
||||||
|
@ -9,7 +9,8 @@ use meilisearch_types::keys::actions;
|
|||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::{Analytics, MultiSearchAggregator};
|
use super::multi_search_analytics::MultiSearchAggregator;
|
||||||
|
use crate::analytics::Analytics;
|
||||||
use crate::error::MeilisearchHttpError;
|
use crate::error::MeilisearchHttpError;
|
||||||
use crate::extractors::authentication::policies::ActionPolicy;
|
use crate::extractors::authentication::policies::ActionPolicy;
|
||||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||||
@ -35,7 +36,7 @@ pub async fn multi_search_with_post(
|
|||||||
search_queue: Data<SearchQueue>,
|
search_queue: Data<SearchQueue>,
|
||||||
params: AwebJson<FederatedSearch, DeserrJsonError>,
|
params: AwebJson<FederatedSearch, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
// Since we don't want to process half of the search requests and then get a permit refused
|
// Since we don't want to process half of the search requests and then get a permit refused
|
||||||
// we're going to get one permit for the whole duration of the multi-search request.
|
// we're going to get one permit for the whole duration of the multi-search request.
|
||||||
@ -43,7 +44,7 @@ pub async fn multi_search_with_post(
|
|||||||
|
|
||||||
let federated_search = params.into_inner();
|
let federated_search = params.into_inner();
|
||||||
|
|
||||||
let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search, &req);
|
let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search);
|
||||||
|
|
||||||
let FederatedSearch { mut queries, federation } = federated_search;
|
let FederatedSearch { mut queries, federation } = federated_search;
|
||||||
|
|
||||||
@ -87,7 +88,7 @@ pub async fn multi_search_with_post(
|
|||||||
multi_aggregate.succeed();
|
multi_aggregate.succeed();
|
||||||
}
|
}
|
||||||
|
|
||||||
analytics.post_multi_search(multi_aggregate);
|
analytics.publish(multi_aggregate, &req);
|
||||||
HttpResponse::Ok().json(search_result??)
|
HttpResponse::Ok().json(search_result??)
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
@ -149,7 +150,7 @@ pub async fn multi_search_with_post(
|
|||||||
if search_results.is_ok() {
|
if search_results.is_ok() {
|
||||||
multi_aggregate.succeed();
|
multi_aggregate.succeed();
|
||||||
}
|
}
|
||||||
analytics.post_multi_search(multi_aggregate);
|
analytics.publish(multi_aggregate, &req);
|
||||||
|
|
||||||
let search_results = search_results.map_err(|(mut err, query_index)| {
|
let search_results = search_results.map_err(|(mut err, query_index)| {
|
||||||
// Add the query index that failed as context for the error message.
|
// Add the query index that failed as context for the error message.
|
||||||
|
168
crates/meilisearch/src/routes/multi_search_analytics.rs
Normal file
168
crates/meilisearch/src/routes/multi_search_analytics.rs
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
use crate::analytics::Aggregate;
|
||||||
|
use crate::search::{FederatedSearch, SearchQueryWithIndex};
|
||||||
|
|
||||||
|
/// Accumulated analytics about multi-search requests.
#[derive(Default)]
pub struct MultiSearchAggregator {
    // ---- requests ----
    /// Number of requests received.
    total_received: usize,
    /// Number of requests that succeeded.
    total_succeeded: usize,

    /// Sum, over all requests, of the number of distinct indexes each one
    /// targets. Divide by `total_received` to get an average.
    total_distinct_index_count: usize,
    /// Number of requests targeting exactly one distinct index. Divide by
    /// `total_received` to get a proportion.
    total_single_index: usize,

    /// Sum, over all requests, of the number of search queries each one
    /// contains. Divide by `total_received` to get an average.
    total_search_count: usize,

    // ---- scoring ----
    /// Whether any query set `show_ranking_score`.
    show_ranking_score: bool,
    /// Whether any query set `show_ranking_score_details`.
    show_ranking_score_details: bool,

    // ---- federation ----
    /// Whether any request carried a `federation` value.
    use_federation: bool,
}
|
||||||
|
|
||||||
|
impl MultiSearchAggregator {
|
||||||
|
pub fn from_federated_search(federated_search: &FederatedSearch) -> Self {
|
||||||
|
let use_federation = federated_search.federation.is_some();
|
||||||
|
|
||||||
|
let distinct_indexes: HashSet<_> = federated_search
|
||||||
|
.queries
|
||||||
|
.iter()
|
||||||
|
.map(|query| {
|
||||||
|
let query = &query;
|
||||||
|
// make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex
|
||||||
|
let SearchQueryWithIndex {
|
||||||
|
index_uid,
|
||||||
|
federation_options: _,
|
||||||
|
q: _,
|
||||||
|
vector: _,
|
||||||
|
offset: _,
|
||||||
|
limit: _,
|
||||||
|
page: _,
|
||||||
|
hits_per_page: _,
|
||||||
|
attributes_to_retrieve: _,
|
||||||
|
retrieve_vectors: _,
|
||||||
|
attributes_to_crop: _,
|
||||||
|
crop_length: _,
|
||||||
|
attributes_to_highlight: _,
|
||||||
|
show_ranking_score: _,
|
||||||
|
show_ranking_score_details: _,
|
||||||
|
show_matches_position: _,
|
||||||
|
filter: _,
|
||||||
|
sort: _,
|
||||||
|
distinct: _,
|
||||||
|
facets: _,
|
||||||
|
highlight_pre_tag: _,
|
||||||
|
highlight_post_tag: _,
|
||||||
|
crop_marker: _,
|
||||||
|
matching_strategy: _,
|
||||||
|
attributes_to_search_on: _,
|
||||||
|
hybrid: _,
|
||||||
|
ranking_score_threshold: _,
|
||||||
|
locales: _,
|
||||||
|
} = query;
|
||||||
|
|
||||||
|
index_uid.as_str()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let show_ranking_score =
|
||||||
|
federated_search.queries.iter().any(|query| query.show_ranking_score);
|
||||||
|
let show_ranking_score_details =
|
||||||
|
federated_search.queries.iter().any(|query| query.show_ranking_score_details);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
total_received: 1,
|
||||||
|
total_succeeded: 0,
|
||||||
|
total_distinct_index_count: distinct_indexes.len(),
|
||||||
|
total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 },
|
||||||
|
total_search_count: federated_search.queries.len(),
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
use_federation,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn succeed(&mut self) {
|
||||||
|
self.total_succeeded = self.total_succeeded.saturating_add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for MultiSearchAggregator {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Documents Searched by Multi-Search POST"
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Aggregate one [MultiSearchAggregator] into another.
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
// write the aggregate in a way that will cause a compilation error if a field is added.
|
||||||
|
|
||||||
|
// get ownership of self, replacing it by a default value.
|
||||||
|
let this = *self;
|
||||||
|
|
||||||
|
let total_received = this.total_received.saturating_add(new.total_received);
|
||||||
|
let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded);
|
||||||
|
let total_distinct_index_count =
|
||||||
|
this.total_distinct_index_count.saturating_add(new.total_distinct_index_count);
|
||||||
|
let total_single_index = this.total_single_index.saturating_add(new.total_single_index);
|
||||||
|
let total_search_count = this.total_search_count.saturating_add(new.total_search_count);
|
||||||
|
let show_ranking_score = this.show_ranking_score || new.show_ranking_score;
|
||||||
|
let show_ranking_score_details =
|
||||||
|
this.show_ranking_score_details || new.show_ranking_score_details;
|
||||||
|
let use_federation = this.use_federation || new.use_federation;
|
||||||
|
|
||||||
|
Box::new(Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
total_distinct_index_count,
|
||||||
|
total_single_index,
|
||||||
|
total_search_count,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
use_federation,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
let Self {
|
||||||
|
total_received,
|
||||||
|
total_succeeded,
|
||||||
|
total_distinct_index_count,
|
||||||
|
total_single_index,
|
||||||
|
total_search_count,
|
||||||
|
show_ranking_score,
|
||||||
|
show_ranking_score_details,
|
||||||
|
use_federation,
|
||||||
|
} = *self;
|
||||||
|
|
||||||
|
json!({
|
||||||
|
"requests": {
|
||||||
|
"total_succeeded": total_succeeded,
|
||||||
|
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||||
|
"total_received": total_received,
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"total_single_index": total_single_index,
|
||||||
|
"total_distinct_index_count": total_distinct_index_count,
|
||||||
|
"avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early
|
||||||
|
},
|
||||||
|
"searches": {
|
||||||
|
"total_search_count": total_search_count,
|
||||||
|
"avg_search_count": (total_search_count as f64) / (total_received as f64),
|
||||||
|
},
|
||||||
|
"scoring": {
|
||||||
|
"show_ranking_score": show_ranking_score,
|
||||||
|
"show_ranking_score_details": show_ranking_score_details,
|
||||||
|
},
|
||||||
|
"federation": {
|
||||||
|
"use_federation": use_federation,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
@ -3,7 +3,6 @@ use actix_web::{web, HttpRequest, HttpResponse};
|
|||||||
use index_scheduler::IndexScheduler;
|
use index_scheduler::IndexScheduler;
|
||||||
use meilisearch_types::error::ResponseError;
|
use meilisearch_types::error::ResponseError;
|
||||||
use meilisearch_types::tasks::KindWithContent;
|
use meilisearch_types::tasks::KindWithContent;
|
||||||
use serde_json::json;
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::Analytics;
|
||||||
@ -17,13 +16,15 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
|||||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
|
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created");
|
||||||
|
|
||||||
pub async fn create_snapshot(
|
pub async fn create_snapshot(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
|
analytics.publish(SnapshotAnalytics::default(), &req);
|
||||||
|
|
||||||
let task = KindWithContent::SnapshotCreation;
|
let task = KindWithContent::SnapshotCreation;
|
||||||
let uid = get_task_id(&req, &opt)?;
|
let uid = get_task_id(&req, &opt)?;
|
||||||
|
@ -8,10 +8,10 @@ use meilisearch_types::error::deserr_codes::InvalidSwapIndexes;
|
|||||||
use meilisearch_types::error::ResponseError;
|
use meilisearch_types::error::ResponseError;
|
||||||
use meilisearch_types::index_uid::IndexUid;
|
use meilisearch_types::index_uid::IndexUid;
|
||||||
use meilisearch_types::tasks::{IndexSwap, KindWithContent};
|
use meilisearch_types::tasks::{IndexSwap, KindWithContent};
|
||||||
use serde_json::json;
|
use serde::Serialize;
|
||||||
|
|
||||||
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::{Aggregate, Analytics};
|
||||||
use crate::error::MeilisearchHttpError;
|
use crate::error::MeilisearchHttpError;
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||||
@ -29,21 +29,36 @@ pub struct SwapIndexesPayload {
|
|||||||
indexes: Vec<IndexUid>,
|
indexes: Vec<IndexUid>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct IndexSwappedAnalytics {
|
||||||
|
swap_operation_number: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Aggregate for IndexSwappedAnalytics {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
"Indexes Swapped"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
swap_operation_number: self.swap_operation_number.max(new.swap_operation_number),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn swap_indexes(
|
pub async fn swap_indexes(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
|
||||||
params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
|
params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let params = params.into_inner();
|
let params = params.into_inner();
|
||||||
analytics.publish(
|
analytics.publish(IndexSwappedAnalytics { swap_operation_number: params.len() }, &req);
|
||||||
"Indexes Swapped".to_string(),
|
|
||||||
json!({
|
|
||||||
"swap_operation_number": params.len(),
|
|
||||||
}),
|
|
||||||
Some(&req),
|
|
||||||
);
|
|
||||||
let filters = index_scheduler.filters();
|
let filters = index_scheduler.filters();
|
||||||
|
|
||||||
let mut swaps = vec![];
|
let mut swaps = vec![];
|
||||||
|
@ -12,18 +12,17 @@ use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList};
|
|||||||
use meilisearch_types::task_view::TaskView;
|
use meilisearch_types::task_view::TaskView;
|
||||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status};
|
use meilisearch_types::tasks::{Kind, KindWithContent, Status};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_json::json;
|
|
||||||
use time::format_description::well_known::Rfc3339;
|
use time::format_description::well_known::Rfc3339;
|
||||||
use time::macros::format_description;
|
use time::macros::format_description;
|
||||||
use time::{Date, Duration, OffsetDateTime, Time};
|
use time::{Date, Duration, OffsetDateTime, Time};
|
||||||
use tokio::task;
|
use tokio::task;
|
||||||
|
|
||||||
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::{Aggregate, AggregateMethod, Analytics};
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
use crate::extractors::sequential_extractor::SeqHandler;
|
use crate::extractors::sequential_extractor::SeqHandler;
|
||||||
use crate::Opt;
|
use crate::{aggregate_methods, Opt};
|
||||||
|
|
||||||
const DEFAULT_LIMIT: u32 = 20;
|
const DEFAULT_LIMIT: u32 = 20;
|
||||||
|
|
||||||
@ -158,12 +157,69 @@ impl TaskDeletionOrCancelationQuery {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Marker types selecting the event name of a `TaskFilterAnalytics`; each one
// is paired with the event name given on its right.
// NOTE(review): the macro is defined elsewhere; confirm what it generates.
aggregate_methods!(
    CancelTasks => "Tasks Canceled",
    DeleteTasks => "Tasks Deleted",
);
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct TaskFilterAnalytics<Method: AggregateMethod> {
|
||||||
|
filtered_by_uid: bool,
|
||||||
|
filtered_by_index_uid: bool,
|
||||||
|
filtered_by_type: bool,
|
||||||
|
filtered_by_status: bool,
|
||||||
|
filtered_by_canceled_by: bool,
|
||||||
|
filtered_by_before_enqueued_at: bool,
|
||||||
|
filtered_by_after_enqueued_at: bool,
|
||||||
|
filtered_by_before_started_at: bool,
|
||||||
|
filtered_by_after_started_at: bool,
|
||||||
|
filtered_by_before_finished_at: bool,
|
||||||
|
filtered_by_after_finished_at: bool,
|
||||||
|
|
||||||
|
#[serde(skip)]
|
||||||
|
marker: std::marker::PhantomData<Method>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Method: AggregateMethod + 'static> Aggregate for TaskFilterAnalytics<Method> {
|
||||||
|
fn event_name(&self) -> &'static str {
|
||||||
|
Method::event_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||||
|
Box::new(Self {
|
||||||
|
filtered_by_uid: self.filtered_by_uid | new.filtered_by_uid,
|
||||||
|
filtered_by_index_uid: self.filtered_by_index_uid | new.filtered_by_index_uid,
|
||||||
|
filtered_by_type: self.filtered_by_type | new.filtered_by_type,
|
||||||
|
filtered_by_status: self.filtered_by_status | new.filtered_by_status,
|
||||||
|
filtered_by_canceled_by: self.filtered_by_canceled_by | new.filtered_by_canceled_by,
|
||||||
|
filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at
|
||||||
|
| new.filtered_by_before_enqueued_at,
|
||||||
|
filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at
|
||||||
|
| new.filtered_by_after_enqueued_at,
|
||||||
|
filtered_by_before_started_at: self.filtered_by_before_started_at
|
||||||
|
| new.filtered_by_before_started_at,
|
||||||
|
filtered_by_after_started_at: self.filtered_by_after_started_at
|
||||||
|
| new.filtered_by_after_started_at,
|
||||||
|
filtered_by_before_finished_at: self.filtered_by_before_finished_at
|
||||||
|
| new.filtered_by_before_finished_at,
|
||||||
|
filtered_by_after_finished_at: self.filtered_by_after_finished_at
|
||||||
|
| new.filtered_by_after_finished_at,
|
||||||
|
|
||||||
|
marker: std::marker::PhantomData,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||||
|
serde_json::to_value(*self).unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn cancel_tasks(
|
async fn cancel_tasks(
|
||||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
|
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
|
||||||
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let params = params.into_inner();
|
let params = params.into_inner();
|
||||||
|
|
||||||
@ -172,21 +228,22 @@ async fn cancel_tasks(
|
|||||||
}
|
}
|
||||||
|
|
||||||
analytics.publish(
|
analytics.publish(
|
||||||
"Tasks Canceled".to_string(),
|
TaskFilterAnalytics::<CancelTasks> {
|
||||||
json!({
|
filtered_by_uid: params.uids.is_some(),
|
||||||
"filtered_by_uid": params.uids.is_some(),
|
filtered_by_index_uid: params.index_uids.is_some(),
|
||||||
"filtered_by_index_uid": params.index_uids.is_some(),
|
filtered_by_type: params.types.is_some(),
|
||||||
"filtered_by_type": params.types.is_some(),
|
filtered_by_status: params.statuses.is_some(),
|
||||||
"filtered_by_status": params.statuses.is_some(),
|
filtered_by_canceled_by: params.canceled_by.is_some(),
|
||||||
"filtered_by_canceled_by": params.canceled_by.is_some(),
|
filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(),
|
||||||
"filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
|
filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(),
|
||||||
"filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
|
filtered_by_before_started_at: params.before_started_at.is_some(),
|
||||||
"filtered_by_before_started_at": params.before_started_at.is_some(),
|
filtered_by_after_started_at: params.after_started_at.is_some(),
|
||||||
"filtered_by_after_started_at": params.after_started_at.is_some(),
|
filtered_by_before_finished_at: params.before_finished_at.is_some(),
|
||||||
"filtered_by_before_finished_at": params.before_finished_at.is_some(),
|
filtered_by_after_finished_at: params.after_finished_at.is_some(),
|
||||||
"filtered_by_after_finished_at": params.after_finished_at.is_some(),
|
|
||||||
}),
|
marker: std::marker::PhantomData,
|
||||||
Some(&req),
|
},
|
||||||
|
&req,
|
||||||
);
|
);
|
||||||
|
|
||||||
let query = params.into_query();
|
let query = params.into_query();
|
||||||
@ -214,7 +271,7 @@ async fn delete_tasks(
|
|||||||
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
||||||
req: HttpRequest,
|
req: HttpRequest,
|
||||||
opt: web::Data<Opt>,
|
opt: web::Data<Opt>,
|
||||||
analytics: web::Data<dyn Analytics>,
|
analytics: web::Data<Analytics>,
|
||||||
) -> Result<HttpResponse, ResponseError> {
|
) -> Result<HttpResponse, ResponseError> {
|
||||||
let params = params.into_inner();
|
let params = params.into_inner();
|
||||||
|
|
||||||
@ -223,22 +280,24 @@ async fn delete_tasks(
|
|||||||
}
|
}
|
||||||
|
|
||||||
analytics.publish(
|
analytics.publish(
|
||||||
"Tasks Deleted".to_string(),
|
TaskFilterAnalytics::<DeleteTasks> {
|
||||||
json!({
|
filtered_by_uid: params.uids.is_some(),
|
||||||
"filtered_by_uid": params.uids.is_some(),
|
filtered_by_index_uid: params.index_uids.is_some(),
|
||||||
"filtered_by_index_uid": params.index_uids.is_some(),
|
filtered_by_type: params.types.is_some(),
|
||||||
"filtered_by_type": params.types.is_some(),
|
filtered_by_status: params.statuses.is_some(),
|
||||||
"filtered_by_status": params.statuses.is_some(),
|
filtered_by_canceled_by: params.canceled_by.is_some(),
|
||||||
"filtered_by_canceled_by": params.canceled_by.is_some(),
|
filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(),
|
||||||
"filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
|
filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(),
|
||||||
"filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
|
filtered_by_before_started_at: params.before_started_at.is_some(),
|
||||||
"filtered_by_before_started_at": params.before_started_at.is_some(),
|
filtered_by_after_started_at: params.after_started_at.is_some(),
|
||||||
"filtered_by_after_started_at": params.after_started_at.is_some(),
|
filtered_by_before_finished_at: params.before_finished_at.is_some(),
|
||||||
"filtered_by_before_finished_at": params.before_finished_at.is_some(),
|
filtered_by_after_finished_at: params.after_finished_at.is_some(),
|
||||||
"filtered_by_after_finished_at": params.after_finished_at.is_some(),
|
|
||||||
}),
|
marker: std::marker::PhantomData,
|
||||||
Some(&req),
|
},
|
||||||
|
&req,
|
||||||
);
|
);
|
||||||
|
|
||||||
let query = params.into_query();
|
let query = params.into_query();
|
||||||
|
|
||||||
let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes(
|
let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes(
|
||||||
|
@ -1195,8 +1195,13 @@ impl<'a> HitMaker<'a> {
|
|||||||
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
||||||
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
||||||
(None, _) => false,
|
(None, _) => false,
|
||||||
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
|
// vectors has no fid, so check its explicit name
|
||||||
(Some(_), None) => true,
|
(Some(_), None) => {
|
||||||
|
// unwrap as otherwise we'd go to the first one
|
||||||
|
let displayed_names = index.displayed_fields(rtxn)?.unwrap();
|
||||||
|
!displayed_names
|
||||||
|
.contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME)
|
||||||
|
}
|
||||||
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
|
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
|
||||||
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
||||||
};
|
};
|
||||||
|
@ -9,8 +9,7 @@ use urlencoding::encode as urlencode;
|
|||||||
|
|
||||||
use super::encoder::Encoder;
|
use super::encoder::Encoder;
|
||||||
use super::service::Service;
|
use super::service::Service;
|
||||||
use super::Value;
|
use super::{Owned, Shared, Value};
|
||||||
use super::{Owned, Shared};
|
|
||||||
use crate::json;
|
use crate::json;
|
||||||
|
|
||||||
pub struct Index<'a, State = Owned> {
|
pub struct Index<'a, State = Owned> {
|
||||||
|
@ -386,7 +386,6 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
|
|||||||
db_path: dir.as_ref().join("db"),
|
db_path: dir.as_ref().join("db"),
|
||||||
dump_dir: dir.as_ref().join("dumps"),
|
dump_dir: dir.as_ref().join("dumps"),
|
||||||
env: "development".to_owned(),
|
env: "development".to_owned(),
|
||||||
#[cfg(feature = "analytics")]
|
|
||||||
no_analytics: true,
|
no_analytics: true,
|
||||||
max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(),
|
max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(),
|
||||||
max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(),
|
max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(),
|
||||||
|
@ -9,8 +9,9 @@ use actix_web::test;
|
|||||||
use actix_web::test::TestRequest;
|
use actix_web::test::TestRequest;
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
use index_scheduler::IndexScheduler;
|
use index_scheduler::IndexScheduler;
|
||||||
|
use meilisearch::analytics::Analytics;
|
||||||
use meilisearch::search_queue::SearchQueue;
|
use meilisearch::search_queue::SearchQueue;
|
||||||
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
|
use meilisearch::{create_app, Opt, SubscriberForSecondLayer};
|
||||||
use meilisearch_auth::AuthController;
|
use meilisearch_auth::AuthController;
|
||||||
use tracing::level_filters::LevelFilter;
|
use tracing::level_filters::LevelFilter;
|
||||||
use tracing_subscriber::Layer;
|
use tracing_subscriber::Layer;
|
||||||
@ -141,7 +142,7 @@ impl Service {
|
|||||||
Data::new(search_queue),
|
Data::new(search_queue),
|
||||||
self.options.clone(),
|
self.options.clone(),
|
||||||
(route_layer_handle, stderr_layer_handle),
|
(route_layer_handle, stderr_layer_handle),
|
||||||
analytics::MockAnalytics::new(&self.options),
|
Data::new(Analytics::no_analytics()),
|
||||||
true,
|
true,
|
||||||
))
|
))
|
||||||
.await
|
.await
|
||||||
|
@ -23,8 +23,8 @@ async fn error_document_update_create_index_bad_uid() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn document_update_with_primary_key() {
|
async fn document_update_with_primary_key() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index("test");
|
let index = server.unique_index();
|
||||||
|
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -32,15 +32,14 @@ async fn document_update_with_primary_key() {
|
|||||||
"content": "foo",
|
"content": "foo",
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
let (_response, code) = index.update_documents(documents, Some("primary")).await;
|
let (response, code) = index.update_documents(documents, Some("primary")).await;
|
||||||
assert_eq!(code, 202);
|
assert_eq!(code, 202);
|
||||||
|
|
||||||
index.wait_task(0).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let (response, code) = index.get_task(0).await;
|
let (response, code) = index.get_task(response.uid()).await;
|
||||||
assert_eq!(code, 200);
|
assert_eq!(code, 200);
|
||||||
assert_eq!(response["status"], "succeeded");
|
assert_eq!(response["status"], "succeeded");
|
||||||
assert_eq!(response["uid"], 0);
|
|
||||||
assert_eq!(response["type"], "documentAdditionOrUpdate");
|
assert_eq!(response["type"], "documentAdditionOrUpdate");
|
||||||
assert_eq!(response["details"]["indexedDocuments"], 1);
|
assert_eq!(response["details"]["indexedDocuments"], 1);
|
||||||
assert_eq!(response["details"]["receivedDocuments"], 1);
|
assert_eq!(response["details"]["receivedDocuments"], 1);
|
||||||
@ -52,8 +51,8 @@ async fn document_update_with_primary_key() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn update_document() {
|
async fn update_document() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index("test");
|
let index = server.unique_index();
|
||||||
|
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -62,10 +61,10 @@ async fn update_document() {
|
|||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let (_response, code) = index.add_documents(documents, None).await;
|
let (response, code) = index.add_documents(documents, None).await;
|
||||||
assert_eq!(code, 202);
|
assert_eq!(code, 202);
|
||||||
|
|
||||||
index.wait_task(0).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -77,9 +76,9 @@ async fn update_document() {
|
|||||||
let (response, code) = index.update_documents(documents, None).await;
|
let (response, code) = index.update_documents(documents, None).await;
|
||||||
assert_eq!(code, 202, "response: {}", response);
|
assert_eq!(code, 202, "response: {}", response);
|
||||||
|
|
||||||
index.wait_task(1).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let (response, code) = index.get_task(1).await;
|
let (response, code) = index.get_task(response.uid()).await;
|
||||||
assert_eq!(code, 200);
|
assert_eq!(code, 200);
|
||||||
assert_eq!(response["status"], "succeeded");
|
assert_eq!(response["status"], "succeeded");
|
||||||
|
|
||||||
@ -96,8 +95,8 @@ async fn update_document() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn update_document_gzip_encoded() {
|
async fn update_document_gzip_encoded() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index_with_encoder("test", Encoder::Gzip);
|
let index = server.unique_index_with_encoder(Encoder::Gzip);
|
||||||
|
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -106,10 +105,10 @@ async fn update_document_gzip_encoded() {
|
|||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let (_response, code) = index.add_documents(documents, None).await;
|
let (response, code) = index.add_documents(documents, None).await;
|
||||||
assert_eq!(code, 202);
|
assert_eq!(code, 202);
|
||||||
|
|
||||||
index.wait_task(0).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -121,9 +120,9 @@ async fn update_document_gzip_encoded() {
|
|||||||
let (response, code) = index.update_documents(documents, None).await;
|
let (response, code) = index.update_documents(documents, None).await;
|
||||||
assert_eq!(code, 202, "response: {}", response);
|
assert_eq!(code, 202, "response: {}", response);
|
||||||
|
|
||||||
index.wait_task(1).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let (response, code) = index.get_task(1).await;
|
let (response, code) = index.get_task(response.uid()).await;
|
||||||
assert_eq!(code, 200);
|
assert_eq!(code, 200);
|
||||||
assert_eq!(response["status"], "succeeded");
|
assert_eq!(response["status"], "succeeded");
|
||||||
|
|
||||||
@ -140,12 +139,12 @@ async fn update_document_gzip_encoded() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn update_larger_dataset() {
|
async fn update_larger_dataset() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index("test");
|
let index = server.unique_index();
|
||||||
let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap();
|
let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap();
|
||||||
index.update_documents(documents, None).await;
|
let (task, _code) = index.update_documents(documents, None).await;
|
||||||
index.wait_task(0).await;
|
index.wait_task(task.uid()).await.succeeded();
|
||||||
let (response, code) = index.get_task(0).await;
|
let (response, code) = index.get_task(task.uid()).await;
|
||||||
assert_eq!(code, 200);
|
assert_eq!(code, 200);
|
||||||
assert_eq!(response["type"], "documentAdditionOrUpdate");
|
assert_eq!(response["type"], "documentAdditionOrUpdate");
|
||||||
assert_eq!(response["details"]["indexedDocuments"], 77);
|
assert_eq!(response["details"]["indexedDocuments"], 77);
|
||||||
@ -158,8 +157,8 @@ async fn update_larger_dataset() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn error_update_documents_bad_document_id() {
|
async fn error_update_documents_bad_document_id() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index("test");
|
let index = server.unique_index();
|
||||||
index.create(Some("docid")).await;
|
index.create(Some("docid")).await;
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -167,8 +166,8 @@ async fn error_update_documents_bad_document_id() {
|
|||||||
"content": "foobar"
|
"content": "foobar"
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
index.update_documents(documents, None).await;
|
let (task, _code) = index.update_documents(documents, None).await;
|
||||||
let response = index.wait_task(1).await;
|
let response = index.wait_task(task.uid()).await;
|
||||||
assert_eq!(response["status"], json!("failed"));
|
assert_eq!(response["status"], json!("failed"));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
response["error"]["message"],
|
response["error"]["message"],
|
||||||
@ -186,8 +185,8 @@ async fn error_update_documents_bad_document_id() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn error_update_documents_missing_document_id() {
|
async fn error_update_documents_missing_document_id() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index("test");
|
let index = server.unique_index();
|
||||||
index.create(Some("docid")).await;
|
index.create(Some("docid")).await;
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -195,8 +194,8 @@ async fn error_update_documents_missing_document_id() {
|
|||||||
"content": "foobar"
|
"content": "foobar"
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
index.update_documents(documents, None).await;
|
let (task, _code) = index.update_documents(documents, None).await;
|
||||||
let response = index.wait_task(1).await;
|
let response = index.wait_task(task.uid()).await;
|
||||||
assert_eq!(response["status"], "failed");
|
assert_eq!(response["status"], "failed");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
response["error"]["message"],
|
response["error"]["message"],
|
||||||
@ -212,8 +211,8 @@ async fn error_update_documents_missing_document_id() {
|
|||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn update_faceted_document() {
|
async fn update_faceted_document() {
|
||||||
let server = Server::new().await;
|
let server = Server::new_shared();
|
||||||
let index = server.index("test");
|
let index = server.unique_index();
|
||||||
|
|
||||||
let (response, code) = index
|
let (response, code) = index
|
||||||
.update_settings(json!({
|
.update_settings(json!({
|
||||||
@ -221,7 +220,7 @@ async fn update_faceted_document() {
|
|||||||
}))
|
}))
|
||||||
.await;
|
.await;
|
||||||
assert_eq!("202", code.as_str(), "{:?}", response);
|
assert_eq!("202", code.as_str(), "{:?}", response);
|
||||||
index.wait_task(0).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let documents: Vec<_> = (0..1000)
|
let documents: Vec<_> = (0..1000)
|
||||||
.map(|id| {
|
.map(|id| {
|
||||||
@ -232,10 +231,10 @@ async fn update_faceted_document() {
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let (_response, code) = index.add_documents(documents.into(), None).await;
|
let (response, code) = index.add_documents(documents.into(), None).await;
|
||||||
assert_eq!(code, 202);
|
assert_eq!(code, 202);
|
||||||
|
|
||||||
index.wait_task(1).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
let documents = json!([
|
let documents = json!([
|
||||||
{
|
{
|
||||||
@ -247,7 +246,7 @@ async fn update_faceted_document() {
|
|||||||
let (response, code) = index.update_documents(documents, None).await;
|
let (response, code) = index.update_documents(documents, None).await;
|
||||||
assert_eq!(code, 202, "response: {}", response);
|
assert_eq!(code, 202, "response: {}", response);
|
||||||
|
|
||||||
index.wait_task(2).await;
|
index.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
index
|
index
|
||||||
.search(json!({"limit": 10}), |response, code| {
|
.search(json!({"limit": 10}), |response, code| {
|
||||||
|
@ -7,8 +7,9 @@ use std::str::FromStr;
|
|||||||
use actix_web::http::header::ContentType;
|
use actix_web::http::header::ContentType;
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
use meili_snap::snapshot;
|
use meili_snap::snapshot;
|
||||||
|
use meilisearch::analytics::Analytics;
|
||||||
use meilisearch::search_queue::SearchQueue;
|
use meilisearch::search_queue::SearchQueue;
|
||||||
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
|
use meilisearch::{create_app, Opt, SubscriberForSecondLayer};
|
||||||
use tracing::level_filters::LevelFilter;
|
use tracing::level_filters::LevelFilter;
|
||||||
use tracing_subscriber::layer::SubscriberExt;
|
use tracing_subscriber::layer::SubscriberExt;
|
||||||
use tracing_subscriber::Layer;
|
use tracing_subscriber::Layer;
|
||||||
@ -54,7 +55,7 @@ async fn basic_test_log_stream_route() {
|
|||||||
Data::new(search_queue),
|
Data::new(search_queue),
|
||||||
server.service.options.clone(),
|
server.service.options.clone(),
|
||||||
(route_layer_handle, stderr_layer_handle),
|
(route_layer_handle, stderr_layer_handle),
|
||||||
analytics::MockAnalytics::new(&server.service.options),
|
Data::new(Analytics::no_analytics()),
|
||||||
true,
|
true,
|
||||||
))
|
))
|
||||||
.await;
|
.await;
|
||||||
|
@ -568,6 +568,57 @@ async fn retrieve_vectors() {
|
|||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
|
// use explicit `_vectors` in displayed attributes
|
||||||
|
let (response, code) = index
|
||||||
|
.update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} ))
|
||||||
|
.await;
|
||||||
|
assert_eq!(202, code, "{:?}", response);
|
||||||
|
index.wait_task(response.uid()).await;
|
||||||
|
|
||||||
|
let (response, code) = index
|
||||||
|
.search_post(
|
||||||
|
json!({"q": "Captain", "hybrid": {"embedder": "default", "semanticRatio": 0.2}, "retrieveVectors": true}),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"title": "Captain Planet",
|
||||||
|
"desc": "He's not part of the Marvel Cinematic Universe",
|
||||||
|
"id": "2",
|
||||||
|
"_vectors": {
|
||||||
|
"default": {
|
||||||
|
"embeddings": "[vectors]",
|
||||||
|
"regenerate": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Captain Marvel",
|
||||||
|
"desc": "a Shazam ersatz",
|
||||||
|
"id": "3",
|
||||||
|
"_vectors": {
|
||||||
|
"default": {
|
||||||
|
"embeddings": "[vectors]",
|
||||||
|
"regenerate": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Shazam!",
|
||||||
|
"desc": "a Captain Marvel ersatz",
|
||||||
|
"id": "1",
|
||||||
|
"_vectors": {
|
||||||
|
"default": {
|
||||||
|
"embeddings": "[vectors]",
|
||||||
|
"regenerate": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
// remove `_vectors` from displayed attributes
|
// remove `_vectors` from displayed attributes
|
||||||
let (response, code) =
|
let (response, code) =
|
||||||
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
|
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
|
||||||
|
@ -4,6 +4,53 @@ use crate::common::{GetAllDocumentsOptions, Server};
|
|||||||
use crate::json;
|
use crate::json;
|
||||||
use crate::vector::generate_default_user_provided_documents;
|
use crate::vector::generate_default_user_provided_documents;
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn field_unavailable_for_source() {
|
||||||
|
let server = Server::new().await;
|
||||||
|
let index = server.index("doggo");
|
||||||
|
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(value, @r###"
|
||||||
|
{
|
||||||
|
"vectorStore": true,
|
||||||
|
"metrics": false,
|
||||||
|
"logsRoute": false,
|
||||||
|
"editDocumentsByFunction": false,
|
||||||
|
"containsFilter": false
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let (response, code) = index
|
||||||
|
.update_settings(json!({
|
||||||
|
"embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}},
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
snapshot!(code, @"400 Bad Request");
|
||||||
|
snapshot!(response, @r###"
|
||||||
|
{
|
||||||
|
"message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`",
|
||||||
|
"code": "invalid_settings_embedders",
|
||||||
|
"type": "invalid_request",
|
||||||
|
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let (response, code) = index
|
||||||
|
.update_settings(json!({
|
||||||
|
"embedders": { "default": {"source": "openAi", "revision": "42"}},
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
snapshot!(code, @"400 Bad Request");
|
||||||
|
snapshot!(response, @r###"
|
||||||
|
{
|
||||||
|
"message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`",
|
||||||
|
"code": "invalid_settings_embedders",
|
||||||
|
"type": "invalid_request",
|
||||||
|
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
#[actix_rt::test]
|
#[actix_rt::test]
|
||||||
async fn update_embedder() {
|
async fn update_embedder() {
|
||||||
let server = Server::new().await;
|
let server = Server::new().await;
|
||||||
|
@ -16,5 +16,6 @@ file-store = { path = "../file-store" }
|
|||||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||||
meilisearch-types = { path = "../meilisearch-types" }
|
meilisearch-types = { path = "../meilisearch-types" }
|
||||||
serde = { version = "1.0.209", features = ["derive"] }
|
serde = { version = "1.0.209", features = ["derive"] }
|
||||||
time = { version = "0.3.36", features = ["formatting"] }
|
time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] }
|
||||||
uuid = { version = "1.10.0", features = ["v4"], default-features = false }
|
uuid = { version = "1.10.0", features = ["v4"], default-features = false }
|
||||||
|
arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" }
|
||||||
|
@ -2,7 +2,7 @@ use std::fs::{read_dir, read_to_string, remove_file, File};
|
|||||||
use std::io::BufWriter;
|
use std::io::BufWriter;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::Context;
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use dump::{DumpWriter, IndexMetadata};
|
use dump::{DumpWriter, IndexMetadata};
|
||||||
use file_store::FileStore;
|
use file_store::FileStore;
|
||||||
@ -10,15 +10,16 @@ use meilisearch_auth::AuthController;
|
|||||||
use meilisearch_types::heed::types::{SerdeJson, Str};
|
use meilisearch_types::heed::types::{SerdeJson, Str};
|
||||||
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
|
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
|
||||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||||
use meilisearch_types::milli::index::{db_name, main_key};
|
|
||||||
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
||||||
use meilisearch_types::tasks::{Status, Task};
|
use meilisearch_types::tasks::{Status, Task};
|
||||||
use meilisearch_types::versioning::{create_version_file, get_version, parse_version};
|
use meilisearch_types::versioning::{get_version, parse_version};
|
||||||
use meilisearch_types::Index;
|
use meilisearch_types::Index;
|
||||||
use time::macros::format_description;
|
use time::macros::format_description;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
use upgrade::OfflineUpgrade;
|
||||||
use uuid_codec::UuidCodec;
|
use uuid_codec::UuidCodec;
|
||||||
|
|
||||||
|
mod upgrade;
|
||||||
mod uuid_codec;
|
mod uuid_codec;
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
@ -72,7 +73,7 @@ enum Command {
|
|||||||
///
|
///
|
||||||
/// Supported upgrade paths:
|
/// Supported upgrade paths:
|
||||||
///
|
///
|
||||||
/// - v1.9.0 -> v1.10.0
|
/// - v1.9.x -> v1.10.x -> v1.11.x
|
||||||
OfflineUpgrade {
|
OfflineUpgrade {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
target_version: String,
|
target_version: String,
|
||||||
@ -96,425 +97,6 @@ fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct OfflineUpgrade {
|
|
||||||
db_path: PathBuf,
|
|
||||||
current_version: (String, String, String),
|
|
||||||
target_version: (String, String, String),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl OfflineUpgrade {
|
|
||||||
fn upgrade(self) -> anyhow::Result<()> {
|
|
||||||
// TODO: if we make this process support more versions, introduce a more flexible way of checking for the version
|
|
||||||
// currently only supports v1.9 to v1.10
|
|
||||||
let (current_major, current_minor, current_patch) = &self.current_version;
|
|
||||||
|
|
||||||
match (current_major.as_str(), current_minor.as_str(), current_patch.as_str()) {
|
|
||||||
("1", "9", _) => {}
|
|
||||||
_ => {
|
|
||||||
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let (target_major, target_minor, target_patch) = &self.target_version;
|
|
||||||
|
|
||||||
match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
|
|
||||||
("1", "10", _) => {}
|
|
||||||
_ => {
|
|
||||||
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
println!("Upgrading from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
|
|
||||||
|
|
||||||
self.v1_9_to_v1_10()?;
|
|
||||||
|
|
||||||
println!("Writing VERSION file");
|
|
||||||
|
|
||||||
create_version_file(&self.db_path, target_major, target_minor, target_patch)
|
|
||||||
.context("while writing VERSION file after the upgrade")?;
|
|
||||||
|
|
||||||
println!("Success");
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn v1_9_to_v1_10(&self) -> anyhow::Result<()> {
|
|
||||||
// 2 changes here
|
|
||||||
|
|
||||||
// 1. date format. needs to be done before opening the Index
|
|
||||||
// 2. REST embedders. We don't support this case right now, so bail
|
|
||||||
|
|
||||||
let index_scheduler_path = self.db_path.join("tasks");
|
|
||||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
|
||||||
.with_context(|| {
|
|
||||||
format!("While trying to open {:?}", index_scheduler_path.display())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let mut sched_wtxn = env.write_txn()?;
|
|
||||||
|
|
||||||
let index_mapping: Database<Str, UuidCodec> =
|
|
||||||
try_opening_database(&env, &sched_wtxn, "index-mapping")?;
|
|
||||||
|
|
||||||
let index_stats: Database<UuidCodec, Unspecified> =
|
|
||||||
try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
|
|
||||||
format!("While trying to open {:?}", index_scheduler_path.display())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let index_count =
|
|
||||||
index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
|
|
||||||
|
|
||||||
// FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
|
|
||||||
// 1. immutably for the iteration
|
|
||||||
// 2. mutably for updating index stats
|
|
||||||
let indexes: Vec<_> = index_mapping
|
|
||||||
.iter(&sched_wtxn)?
|
|
||||||
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut rest_embedders = Vec::new();
|
|
||||||
|
|
||||||
let mut unwrapped_indexes = Vec::new();
|
|
||||||
|
|
||||||
// check that update can take place
|
|
||||||
for (index_index, result) in indexes.into_iter().enumerate() {
|
|
||||||
let (uid, uuid) = result?;
|
|
||||||
let index_path = self.db_path.join("indexes").join(uuid.to_string());
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
|
|
||||||
index_index + 1,
|
|
||||||
index_path.display()
|
|
||||||
);
|
|
||||||
|
|
||||||
let index_env = unsafe {
|
|
||||||
// FIXME: fetch the 25 magic number from the index file
|
|
||||||
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
|
||||||
format!("while opening index {uid} at '{}'", index_path.display())
|
|
||||||
})?
|
|
||||||
};
|
|
||||||
|
|
||||||
let index_txn = index_env.read_txn().with_context(|| {
|
|
||||||
format!(
|
|
||||||
"while obtaining a write transaction for index {uid} at {}",
|
|
||||||
index_path.display()
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
println!("\t- Checking for incompatible embedders (REST embedders)");
|
|
||||||
let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
|
|
||||||
|
|
||||||
if rest_embedders_for_index.is_empty() {
|
|
||||||
unwrapped_indexes.push((uid, uuid));
|
|
||||||
} else {
|
|
||||||
// no need to add to unwrapped indexes because we'll exit early
|
|
||||||
rest_embedders.push((uid, rest_embedders_for_index));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !rest_embedders.is_empty() {
|
|
||||||
let rest_embedders = rest_embedders
|
|
||||||
.into_iter()
|
|
||||||
.flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
|
|
||||||
.map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join("\n");
|
|
||||||
bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
|
|
||||||
The database has not been modified and is still a valid v1.9 database.");
|
|
||||||
}
|
|
||||||
|
|
||||||
println!("Update can take place, updating");
|
|
||||||
|
|
||||||
for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
|
|
||||||
let index_path = self.db_path.join("indexes").join(uuid.to_string());
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"[{}/{index_count}]Updating index `{uid}` at `{}`",
|
|
||||||
index_index + 1,
|
|
||||||
index_path.display()
|
|
||||||
);
|
|
||||||
|
|
||||||
let index_env = unsafe {
|
|
||||||
// FIXME: fetch the 25 magic number from the index file
|
|
||||||
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
|
||||||
format!("while opening index {uid} at '{}'", index_path.display())
|
|
||||||
})?
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
|
||||||
format!(
|
|
||||||
"while obtaining a write transaction for index `{uid}` at `{}`",
|
|
||||||
index_path.display()
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
println!("\t- Updating index stats");
|
|
||||||
update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
|
|
||||||
println!("\t- Updating date format");
|
|
||||||
update_date_format(&uid, &index_env, &mut index_wtxn)?;
|
|
||||||
|
|
||||||
index_wtxn.commit().with_context(|| {
|
|
||||||
format!(
|
|
||||||
"while committing the write txn for index `{uid}` at {}",
|
|
||||||
index_path.display()
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
|
|
||||||
sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
|
|
||||||
|
|
||||||
println!("Upgrading database succeeded");
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub mod v1_9 {
|
|
||||||
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
|
|
||||||
|
|
||||||
/// The statistics that can be computed from an `Index` object.
|
|
||||||
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
|
||||||
pub struct IndexStats {
|
|
||||||
/// Number of documents in the index.
|
|
||||||
pub number_of_documents: u64,
|
|
||||||
/// Size taken up by the index' DB, in bytes.
|
|
||||||
///
|
|
||||||
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
|
|
||||||
/// are not returned to the disk after a deletion, this number is typically larger than
|
|
||||||
/// `used_database_size` that only includes the size of the used pages.
|
|
||||||
pub database_size: u64,
|
|
||||||
/// Size taken by the used pages of the index' DB, in bytes.
|
|
||||||
///
|
|
||||||
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
|
||||||
/// this value is typically smaller than `database_size`.
|
|
||||||
pub used_database_size: u64,
|
|
||||||
/// Association of every field name with the number of times it occurs in the documents.
|
|
||||||
pub field_distribution: FieldDistribution,
|
|
||||||
/// Creation date of the index.
|
|
||||||
pub created_at: time::OffsetDateTime,
|
|
||||||
/// Date of the last update of the index.
|
|
||||||
pub updated_at: time::OffsetDateTime,
|
|
||||||
}
|
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
|
||||||
pub struct IndexEmbeddingConfig {
|
|
||||||
pub name: String,
|
|
||||||
pub config: EmbeddingConfig,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
|
|
||||||
pub struct EmbeddingConfig {
|
|
||||||
/// Options of the embedder, specific to each kind of embedder
|
|
||||||
pub embedder_options: EmbedderOptions,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Options of an embedder, specific to each kind of embedder.
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
|
||||||
pub enum EmbedderOptions {
|
|
||||||
HuggingFace(hf::EmbedderOptions),
|
|
||||||
OpenAi(openai::EmbedderOptions),
|
|
||||||
Ollama(ollama::EmbedderOptions),
|
|
||||||
UserProvided(manual::EmbedderOptions),
|
|
||||||
Rest(rest::EmbedderOptions),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for EmbedderOptions {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
mod hf {
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
|
||||||
pub struct EmbedderOptions {
|
|
||||||
pub model: String,
|
|
||||||
pub revision: Option<String>,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mod openai {
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
|
||||||
pub struct EmbedderOptions {
|
|
||||||
pub api_key: Option<String>,
|
|
||||||
pub dimensions: Option<usize>,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mod ollama {
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
|
||||||
pub struct EmbedderOptions {
|
|
||||||
pub embedding_model: String,
|
|
||||||
pub url: Option<String>,
|
|
||||||
pub api_key: Option<String>,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mod manual {
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
|
||||||
pub struct EmbedderOptions {
|
|
||||||
pub dimensions: usize,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mod rest {
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
|
|
||||||
pub struct EmbedderOptions {
|
|
||||||
pub api_key: Option<String>,
|
|
||||||
pub dimensions: Option<usize>,
|
|
||||||
pub url: String,
|
|
||||||
pub input_field: Vec<String>,
|
|
||||||
// path to the array of embeddings
|
|
||||||
pub path_to_embeddings: Vec<String>,
|
|
||||||
// shape of a single embedding
|
|
||||||
pub embedding_object: Vec<String>,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub type OffsetDateTime = time::OffsetDateTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub mod v1_10 {
|
|
||||||
use crate::v1_9;
|
|
||||||
|
|
||||||
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
|
|
||||||
|
|
||||||
/// The statistics that can be computed from an `Index` object.
|
|
||||||
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
|
||||||
pub struct IndexStats {
|
|
||||||
/// Number of documents in the index.
|
|
||||||
pub number_of_documents: u64,
|
|
||||||
/// Size taken up by the index' DB, in bytes.
|
|
||||||
///
|
|
||||||
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
|
|
||||||
/// are not returned to the disk after a deletion, this number is typically larger than
|
|
||||||
/// `used_database_size` that only includes the size of the used pages.
|
|
||||||
pub database_size: u64,
|
|
||||||
/// Size taken by the used pages of the index' DB, in bytes.
|
|
||||||
///
|
|
||||||
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
|
||||||
/// this value is typically smaller than `database_size`.
|
|
||||||
pub used_database_size: u64,
|
|
||||||
/// Association of every field name with the number of times it occurs in the documents.
|
|
||||||
pub field_distribution: FieldDistribution,
|
|
||||||
/// Creation date of the index.
|
|
||||||
#[serde(with = "time::serde::rfc3339")]
|
|
||||||
pub created_at: time::OffsetDateTime,
|
|
||||||
/// Date of the last update of the index.
|
|
||||||
#[serde(with = "time::serde::rfc3339")]
|
|
||||||
pub updated_at: time::OffsetDateTime,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<v1_9::IndexStats> for IndexStats {
|
|
||||||
fn from(
|
|
||||||
v1_9::IndexStats {
|
|
||||||
number_of_documents,
|
|
||||||
database_size,
|
|
||||||
used_database_size,
|
|
||||||
field_distribution,
|
|
||||||
created_at,
|
|
||||||
updated_at,
|
|
||||||
}: v1_9::IndexStats,
|
|
||||||
) -> Self {
|
|
||||||
IndexStats {
|
|
||||||
number_of_documents,
|
|
||||||
database_size,
|
|
||||||
used_database_size,
|
|
||||||
field_distribution,
|
|
||||||
created_at,
|
|
||||||
updated_at,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(serde::Serialize, serde::Deserialize)]
|
|
||||||
#[serde(transparent)]
|
|
||||||
pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn update_index_stats(
|
|
||||||
index_stats: Database<UuidCodec, Unspecified>,
|
|
||||||
index_uid: &str,
|
|
||||||
index_uuid: uuid::Uuid,
|
|
||||||
sched_wtxn: &mut RwTxn,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let ctx = || format!("while updating index stats for index `{index_uid}`");
|
|
||||||
|
|
||||||
let stats: Option<v1_9::IndexStats> = index_stats
|
|
||||||
.remap_data_type::<SerdeJson<v1_9::IndexStats>>()
|
|
||||||
.get(sched_wtxn, &index_uuid)
|
|
||||||
.with_context(ctx)?;
|
|
||||||
|
|
||||||
if let Some(stats) = stats {
|
|
||||||
let stats: v1_10::IndexStats = stats.into();
|
|
||||||
|
|
||||||
index_stats
|
|
||||||
.remap_data_type::<SerdeJson<v1_10::IndexStats>>()
|
|
||||||
.put(sched_wtxn, &index_uuid, &stats)
|
|
||||||
.with_context(ctx)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn update_date_format(
|
|
||||||
index_uid: &str,
|
|
||||||
index_env: &Env,
|
|
||||||
index_wtxn: &mut RwTxn,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
|
|
||||||
.with_context(|| format!("while updating date format for index `{index_uid}`"))?;
|
|
||||||
|
|
||||||
date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
|
|
||||||
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn find_rest_embedders(
|
|
||||||
index_uid: &str,
|
|
||||||
index_env: &Env,
|
|
||||||
index_txn: &RoTxn,
|
|
||||||
) -> anyhow::Result<Vec<String>> {
|
|
||||||
let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
|
|
||||||
.with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
|
|
||||||
|
|
||||||
let mut rest_embedders = vec![];
|
|
||||||
|
|
||||||
for config in main
|
|
||||||
.remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
|
|
||||||
.get(index_txn, main_key::EMBEDDING_CONFIGS)?
|
|
||||||
.unwrap_or_default()
|
|
||||||
{
|
|
||||||
if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
|
|
||||||
rest_embedders.push(config.name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(rest_embedders)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn date_round_trip(
|
|
||||||
wtxn: &mut RwTxn,
|
|
||||||
index_uid: &str,
|
|
||||||
db: Database<Unspecified, Unspecified>,
|
|
||||||
key: &str,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let datetime =
|
|
||||||
db.remap_types::<Str, SerdeJson<v1_9::OffsetDateTime>>().get(wtxn, key).with_context(
|
|
||||||
|| format!("could not read `{key}` while updating date format for index `{index_uid}`"),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
if let Some(datetime) = datetime {
|
|
||||||
db.remap_types::<Str, SerdeJson<v1_10::OffsetDateTime>>()
|
|
||||||
.put(wtxn, key, &v1_10::OffsetDateTime(datetime))
|
|
||||||
.with_context(|| {
|
|
||||||
format!(
|
|
||||||
"could not write `{key}` while updating date format for index `{index_uid}`"
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Clears the task queue located at `db_path`.
|
/// Clears the task queue located at `db_path`.
|
||||||
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
|
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
|
||||||
let path = db_path.join("tasks");
|
let path = db_path.join("tasks");
|
||||||
|
73
crates/meilitool/src/upgrade/mod.rs
Normal file
73
crates/meilitool/src/upgrade/mod.rs
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
mod v1_10;
|
||||||
|
mod v1_11;
|
||||||
|
mod v1_9;
|
||||||
|
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
use anyhow::{bail, Context};
|
||||||
|
use meilisearch_types::versioning::create_version_file;
|
||||||
|
|
||||||
|
use v1_10::v1_9_to_v1_10;
|
||||||
|
|
||||||
|
use crate::upgrade::v1_11::v1_10_to_v1_11;
|
||||||
|
|
||||||
|
/// Parameters of an offline database upgrade.
pub struct OfflineUpgrade {
    /// Root directory of the database; handed to every upgrade step and to
    /// `create_version_file`.
    pub db_path: PathBuf,
    /// Version the database is currently at, as `(major, minor, patch)`.
    pub current_version: (String, String, String),
    /// Version the database must be brought to, as `(major, minor, patch)`.
    pub target_version: (String, String, String),
}
|
||||||
|
|
||||||
|
impl OfflineUpgrade {
|
||||||
|
pub fn upgrade(self) -> anyhow::Result<()> {
|
||||||
|
let upgrade_list = [
|
||||||
|
(v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"),
|
||||||
|
(v1_10_to_v1_11, "1", "11", "0"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let (current_major, current_minor, current_patch) = &self.current_version;
|
||||||
|
|
||||||
|
let start_at = match (
|
||||||
|
current_major.as_str(),
|
||||||
|
current_minor.as_str(),
|
||||||
|
current_patch.as_str(),
|
||||||
|
) {
|
||||||
|
("1", "9", _) => 0,
|
||||||
|
("1", "10", _) => 1,
|
||||||
|
_ => {
|
||||||
|
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10")
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let (target_major, target_minor, target_patch) = &self.target_version;
|
||||||
|
|
||||||
|
let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
|
||||||
|
("1", "10", _) => 0,
|
||||||
|
("1", "11", _) => 1,
|
||||||
|
(major, _, _) if major.starts_with('v') => {
|
||||||
|
bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.")
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11")
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
|
||||||
|
|
||||||
|
#[allow(clippy::needless_range_loop)]
|
||||||
|
for index in start_at..=ends_at {
|
||||||
|
let (func, major, minor, patch) = upgrade_list[index];
|
||||||
|
(func)(&self.db_path)?;
|
||||||
|
println!("Done");
|
||||||
|
// We're writing the version file just in case an issue arise _while_ upgrading.
|
||||||
|
// We don't want the DB to fail in an unknown state.
|
||||||
|
println!("Writing VERSION file");
|
||||||
|
|
||||||
|
create_version_file(&self.db_path, major, minor, patch)
|
||||||
|
.context("while writing VERSION file after the upgrade")?;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("Success");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
289
crates/meilitool/src/upgrade/v1_10.rs
Normal file
289
crates/meilitool/src/upgrade/v1_10.rs
Normal file
@ -0,0 +1,289 @@
|
|||||||
|
use anyhow::bail;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use meilisearch_types::{
|
||||||
|
heed::{
|
||||||
|
types::{SerdeJson, Str},
|
||||||
|
Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
|
||||||
|
},
|
||||||
|
milli::index::{db_name, main_key},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
|
||||||
|
|
||||||
|
use super::v1_9;
|
||||||
|
|
||||||
|
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
|
||||||
|
|
||||||
|
/// The statistics that can be computed from an `Index` object.
|
||||||
|
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
||||||
|
pub struct IndexStats {
|
||||||
|
/// Number of documents in the index.
|
||||||
|
pub number_of_documents: u64,
|
||||||
|
/// Size taken up by the index' DB, in bytes.
|
||||||
|
///
|
||||||
|
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
|
||||||
|
/// are not returned to the disk after a deletion, this number is typically larger than
|
||||||
|
/// `used_database_size` that only includes the size of the used pages.
|
||||||
|
pub database_size: u64,
|
||||||
|
/// Size taken by the used pages of the index' DB, in bytes.
|
||||||
|
///
|
||||||
|
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
||||||
|
/// this value is typically smaller than `database_size`.
|
||||||
|
pub used_database_size: u64,
|
||||||
|
/// Association of every field name with the number of times it occurs in the documents.
|
||||||
|
pub field_distribution: FieldDistribution,
|
||||||
|
/// Creation date of the index.
|
||||||
|
#[serde(with = "time::serde::rfc3339")]
|
||||||
|
pub created_at: time::OffsetDateTime,
|
||||||
|
/// Date of the last update of the index.
|
||||||
|
#[serde(with = "time::serde::rfc3339")]
|
||||||
|
pub updated_at: time::OffsetDateTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<v1_9::IndexStats> for IndexStats {
    /// Builds the v1.10 stats from the v1.9 ones.
    ///
    /// Every field is carried over as-is, except the two dates, which are
    /// taken out of their v1.9 wrapper through `.0`.
    fn from(legacy: v1_9::IndexStats) -> Self {
        Self {
            number_of_documents: legacy.number_of_documents,
            database_size: legacy.database_size,
            used_database_size: legacy.used_database_size,
            field_distribution: legacy.field_distribution,
            created_at: legacy.created_at.0,
            updated_at: legacy.updated_at.0,
        }
    }
}
|
||||||
|
|
||||||
|
#[derive(serde::Serialize, serde::Deserialize)]
|
||||||
|
#[serde(transparent)]
|
||||||
|
pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
|
||||||
|
|
||||||
|
fn update_index_stats(
|
||||||
|
index_stats: Database<UuidCodec, Unspecified>,
|
||||||
|
index_uid: &str,
|
||||||
|
index_uuid: uuid::Uuid,
|
||||||
|
sched_wtxn: &mut RwTxn,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let ctx = || format!("while updating index stats for index `{index_uid}`");
|
||||||
|
|
||||||
|
let stats: Option<&str> = index_stats
|
||||||
|
.remap_data_type::<Str>()
|
||||||
|
.get(sched_wtxn, &index_uuid)
|
||||||
|
.with_context(ctx)
|
||||||
|
.with_context(|| "While reading value")?;
|
||||||
|
dbg!(stats);
|
||||||
|
|
||||||
|
let stats: Option<v1_9::IndexStats> = index_stats
|
||||||
|
.remap_data_type::<SerdeJson<v1_9::IndexStats>>()
|
||||||
|
.get(sched_wtxn, &index_uuid)
|
||||||
|
.with_context(ctx)
|
||||||
|
.with_context(|| "While reading value")?;
|
||||||
|
|
||||||
|
if let Some(stats) = stats {
|
||||||
|
let stats: self::IndexStats = stats.into();
|
||||||
|
|
||||||
|
index_stats
|
||||||
|
.remap_data_type::<SerdeJson<self::IndexStats>>()
|
||||||
|
.put(sched_wtxn, &index_uuid, &stats)
|
||||||
|
.with_context(ctx)
|
||||||
|
.with_context(|| "While writing value")?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn update_date_format(
|
||||||
|
index_uid: &str,
|
||||||
|
index_env: &Env,
|
||||||
|
index_wtxn: &mut RwTxn,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
|
||||||
|
.with_context(|| format!("while updating date format for index `{index_uid}`"))?;
|
||||||
|
|
||||||
|
date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
|
||||||
|
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_rest_embedders(
|
||||||
|
index_uid: &str,
|
||||||
|
index_env: &Env,
|
||||||
|
index_txn: &RoTxn,
|
||||||
|
) -> anyhow::Result<Vec<String>> {
|
||||||
|
let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
|
||||||
|
.with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
|
||||||
|
|
||||||
|
let mut rest_embedders = vec![];
|
||||||
|
|
||||||
|
for config in main
|
||||||
|
.remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
|
||||||
|
.get(index_txn, main_key::EMBEDDING_CONFIGS)?
|
||||||
|
.unwrap_or_default()
|
||||||
|
{
|
||||||
|
if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
|
||||||
|
rest_embedders.push(config.name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(rest_embedders)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn date_round_trip(
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
index_uid: &str,
|
||||||
|
db: Database<Unspecified, Unspecified>,
|
||||||
|
key: &str,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let datetime =
|
||||||
|
db.remap_types::<Str, SerdeJson<v1_9::LegacyDateTime>>().get(wtxn, key).with_context(
|
||||||
|
|| format!("could not read `{key}` while updating date format for index `{index_uid}`"),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
if let Some(datetime) = datetime {
|
||||||
|
db.remap_types::<Str, SerdeJson<self::OffsetDateTime>>()
|
||||||
|
.put(wtxn, key, &self::OffsetDateTime(datetime.0))
|
||||||
|
.with_context(|| {
|
||||||
|
format!(
|
||||||
|
"could not write `{key}` while updating date format for index `{index_uid}`"
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> {
|
||||||
|
println!("Upgrading from v1.9.0 to v1.10.0");
|
||||||
|
// 2 changes here
|
||||||
|
|
||||||
|
// 1. date format. needs to be done before opening the Index
|
||||||
|
// 2. REST embedders. We don't support this case right now, so bail
|
||||||
|
|
||||||
|
let index_scheduler_path = db_path.join("tasks");
|
||||||
|
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||||
|
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||||
|
|
||||||
|
let mut sched_wtxn = env.write_txn()?;
|
||||||
|
|
||||||
|
let index_mapping: Database<Str, UuidCodec> =
|
||||||
|
try_opening_database(&env, &sched_wtxn, "index-mapping")?;
|
||||||
|
|
||||||
|
let index_stats: Database<UuidCodec, Unspecified> =
|
||||||
|
try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
|
||||||
|
format!("While trying to open {:?}", index_scheduler_path.display())
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let index_count =
|
||||||
|
index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
|
||||||
|
|
||||||
|
// FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
|
||||||
|
// 1. immutably for the iteration
|
||||||
|
// 2. mutably for updating index stats
|
||||||
|
let indexes: Vec<_> = index_mapping
|
||||||
|
.iter(&sched_wtxn)?
|
||||||
|
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut rest_embedders = Vec::new();
|
||||||
|
|
||||||
|
let mut unwrapped_indexes = Vec::new();
|
||||||
|
|
||||||
|
// check that update can take place
|
||||||
|
for (index_index, result) in indexes.into_iter().enumerate() {
|
||||||
|
let (uid, uuid) = result?;
|
||||||
|
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
|
||||||
|
index_index + 1,
|
||||||
|
index_path.display()
|
||||||
|
);
|
||||||
|
|
||||||
|
let index_env = unsafe {
|
||||||
|
// FIXME: fetch the 25 magic number from the index file
|
||||||
|
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
||||||
|
format!("while opening index {uid} at '{}'", index_path.display())
|
||||||
|
})?
|
||||||
|
};
|
||||||
|
|
||||||
|
let index_txn = index_env.read_txn().with_context(|| {
|
||||||
|
format!(
|
||||||
|
"while obtaining a write transaction for index {uid} at {}",
|
||||||
|
index_path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
println!("\t- Checking for incompatible embedders (REST embedders)");
|
||||||
|
let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
|
||||||
|
|
||||||
|
if rest_embedders_for_index.is_empty() {
|
||||||
|
unwrapped_indexes.push((uid, uuid));
|
||||||
|
} else {
|
||||||
|
// no need to add to unwrapped indexes because we'll exit early
|
||||||
|
rest_embedders.push((uid, rest_embedders_for_index));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !rest_embedders.is_empty() {
|
||||||
|
let rest_embedders = rest_embedders
|
||||||
|
.into_iter()
|
||||||
|
.flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
|
||||||
|
.map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("\n");
|
||||||
|
bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
|
||||||
|
The database has not been modified and is still a valid v1.9 database.");
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("Update can take place, updating");
|
||||||
|
|
||||||
|
for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
|
||||||
|
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"[{}/{index_count}]Updating index `{uid}` at `{}`",
|
||||||
|
index_index + 1,
|
||||||
|
index_path.display()
|
||||||
|
);
|
||||||
|
|
||||||
|
let index_env = unsafe {
|
||||||
|
// FIXME: fetch the 25 magic number from the index file
|
||||||
|
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
||||||
|
format!("while opening index {uid} at '{}'", index_path.display())
|
||||||
|
})?
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
||||||
|
format!(
|
||||||
|
"while obtaining a write transaction for index `{uid}` at `{}`",
|
||||||
|
index_path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
println!("\t- Updating index stats");
|
||||||
|
update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
|
||||||
|
println!("\t- Updating date format");
|
||||||
|
update_date_format(&uid, &index_env, &mut index_wtxn)?;
|
||||||
|
|
||||||
|
index_wtxn.commit().with_context(|| {
|
||||||
|
format!("while committing the write txn for index `{uid}` at {}", index_path.display())
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
|
||||||
|
|
||||||
|
println!("Upgrading database succeeded");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
85
crates/meilitool/src/upgrade/v1_11.rs
Normal file
85
crates/meilitool/src/upgrade/v1_11.rs
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
//! The breaking changes that happened between the v1.10 and the v1.11 are:
|
||||||
|
//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0
|
||||||
|
//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata.
|
||||||
|
//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB.
|
||||||
|
//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything.
|
||||||
|
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use meilisearch_types::{
|
||||||
|
heed::{types::Str, Database, EnvOpenOptions},
|
||||||
|
milli::index::db_name,
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
|
||||||
|
|
||||||
|
pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> {
|
||||||
|
println!("Upgrading from v1.10.0 to v1.11.0");
|
||||||
|
|
||||||
|
let index_scheduler_path = db_path.join("tasks");
|
||||||
|
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||||
|
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||||
|
|
||||||
|
let sched_rtxn = env.read_txn()?;
|
||||||
|
|
||||||
|
let index_mapping: Database<Str, UuidCodec> =
|
||||||
|
try_opening_database(&env, &sched_rtxn, "index-mapping")?;
|
||||||
|
|
||||||
|
let index_count =
|
||||||
|
index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?;
|
||||||
|
|
||||||
|
let indexes: Vec<_> = index_mapping
|
||||||
|
.iter(&sched_rtxn)?
|
||||||
|
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
for (index_index, result) in indexes.into_iter().enumerate() {
|
||||||
|
let (uid, uuid) = result?;
|
||||||
|
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"[{}/{index_count}]Updating embeddings for `{uid}` at `{}`",
|
||||||
|
index_index + 1,
|
||||||
|
index_path.display()
|
||||||
|
);
|
||||||
|
|
||||||
|
let index_env = unsafe {
|
||||||
|
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
||||||
|
format!("while opening index {uid} at '{}'", index_path.display())
|
||||||
|
})?
|
||||||
|
};
|
||||||
|
|
||||||
|
let index_rtxn = index_env.read_txn().with_context(|| {
|
||||||
|
format!(
|
||||||
|
"while obtaining a read transaction for index {uid} at {}",
|
||||||
|
index_path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
let index_read_database =
|
||||||
|
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
|
||||||
|
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
||||||
|
|
||||||
|
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
||||||
|
format!(
|
||||||
|
"while obtaining a write transaction for index {uid} at {}",
|
||||||
|
index_path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let index_write_database =
|
||||||
|
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
|
||||||
|
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
||||||
|
|
||||||
|
arroy_v04_to_v05::ugrade_from_prev_version(
|
||||||
|
&index_rtxn,
|
||||||
|
index_read_database,
|
||||||
|
&mut index_wtxn,
|
||||||
|
index_write_database,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
index_wtxn.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
158
crates/meilitool/src/upgrade/v1_9.rs
Normal file
158
crates/meilitool/src/upgrade/v1_9.rs
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use time::{Date, OffsetDateTime, Time, UtcOffset};
|
||||||
|
|
||||||
|
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
|
||||||
|
|
||||||
|
/// The statistics that can be computed from an `Index` object.
|
||||||
|
#[derive(serde::Deserialize, Debug)]
|
||||||
|
pub struct IndexStats {
|
||||||
|
/// Number of documents in the index.
|
||||||
|
pub number_of_documents: u64,
|
||||||
|
/// Size taken up by the index' DB, in bytes.
|
||||||
|
///
|
||||||
|
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
|
||||||
|
/// are not returned to the disk after a deletion, this number is typically larger than
|
||||||
|
/// `used_database_size` that only includes the size of the used pages.
|
||||||
|
pub database_size: u64,
|
||||||
|
/// Size taken by the used pages of the index' DB, in bytes.
|
||||||
|
///
|
||||||
|
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
||||||
|
/// this value is typically smaller than `database_size`.
|
||||||
|
pub used_database_size: u64,
|
||||||
|
/// Association of every field name with the number of times it occurs in the documents.
|
||||||
|
pub field_distribution: FieldDistribution,
|
||||||
|
/// Creation date of the index.
|
||||||
|
pub created_at: LegacyDateTime,
|
||||||
|
/// Date of the last update of the index.
|
||||||
|
pub updated_at: LegacyDateTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
pub struct IndexEmbeddingConfig {
|
||||||
|
pub name: String,
|
||||||
|
pub config: EmbeddingConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct EmbeddingConfig {
|
||||||
|
/// Options of the embedder, specific to each kind of embedder
|
||||||
|
pub embedder_options: EmbedderOptions,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Options of an embedder, specific to each kind of embedder.
|
||||||
|
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub enum EmbedderOptions {
|
||||||
|
HuggingFace(hf::EmbedderOptions),
|
||||||
|
OpenAi(openai::EmbedderOptions),
|
||||||
|
Ollama(ollama::EmbedderOptions),
|
||||||
|
UserProvided(manual::EmbedderOptions),
|
||||||
|
Rest(rest::EmbedderOptions),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for EmbedderOptions {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mod hf {
|
||||||
|
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct EmbedderOptions {
|
||||||
|
pub model: String,
|
||||||
|
pub revision: Option<String>,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mod openai {
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct EmbedderOptions {
|
||||||
|
pub api_key: Option<String>,
|
||||||
|
pub dimensions: Option<usize>,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mod ollama {
|
||||||
|
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct EmbedderOptions {
|
||||||
|
pub embedding_model: String,
|
||||||
|
pub url: Option<String>,
|
||||||
|
pub api_key: Option<String>,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mod manual {
|
||||||
|
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct EmbedderOptions {
|
||||||
|
pub dimensions: usize,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mod rest {
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
|
||||||
|
pub struct EmbedderOptions {
|
||||||
|
pub api_key: Option<String>,
|
||||||
|
pub dimensions: Option<usize>,
|
||||||
|
pub url: String,
|
||||||
|
pub input_field: Vec<String>,
|
||||||
|
// path to the array of embeddings
|
||||||
|
pub path_to_embeddings: Vec<String>,
|
||||||
|
// shape of a single embedding
|
||||||
|
pub embedding_object: Vec<String>,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A datetime from Meilisearch v1.9 with an unspecified format.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct LegacyDateTime(pub OffsetDateTime);
|
||||||
|
|
||||||
|
impl<'de> Deserialize<'de> for LegacyDateTime {
|
||||||
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
struct Visitor;
|
||||||
|
impl<'de> serde::de::Visitor<'de> for Visitor {
|
||||||
|
type Value = OffsetDateTime;
|
||||||
|
|
||||||
|
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
write!(formatter, "a valid datetime")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Comes from a binary. The legacy format is:
|
||||||
|
// 2024-11-04 13:32:08.48368 +00:00:00
|
||||||
|
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]");
|
||||||
|
OffsetDateTime::parse(v, format).map_err(E::custom)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Comes from the docker image, the legacy format is:
|
||||||
|
// [2024, 309, 17, 15, 1, 698184971, 0,0,0]
|
||||||
|
// year, day in year, hour, minute, sec, subsec , offset stuff
|
||||||
|
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
|
||||||
|
where
|
||||||
|
A: serde::de::SeqAccess<'de>,
|
||||||
|
{
|
||||||
|
let mut vec = Vec::new();
|
||||||
|
// We must deserialize the value as `i64` because the largest values are `u32` and `i32`
|
||||||
|
while let Some(el) = seq.next_element::<i64>()? {
|
||||||
|
vec.push(el);
|
||||||
|
}
|
||||||
|
if vec.len() != 9 {
|
||||||
|
return Err(serde::de::Error::custom(format!(
|
||||||
|
"Invalid datetime, received an array of {} elements instead of 9",
|
||||||
|
vec.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
Ok(OffsetDateTime::new_in_offset(
|
||||||
|
Date::from_ordinal_date(vec[0] as i32, vec[1] as u16)
|
||||||
|
.map_err(serde::de::Error::custom)?,
|
||||||
|
Time::from_hms_nano(vec[2] as u8, vec[3] as u8, vec[4] as u8, vec[5] as u32)
|
||||||
|
.map_err(serde::de::Error::custom)?,
|
||||||
|
UtcOffset::from_hms(vec[6] as i8, vec[7] as i8, vec[8] as i8)
|
||||||
|
.map_err(serde::de::Error::custom)?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
deserializer.deserialize_any(Visitor).map(LegacyDateTime)
|
||||||
|
}
|
||||||
|
}
|
@ -15,7 +15,7 @@ license.workspace = true
|
|||||||
bimap = { version = "0.6.3", features = ["serde"] }
|
bimap = { version = "0.6.3", features = ["serde"] }
|
||||||
bincode = "1.3.3"
|
bincode = "1.3.3"
|
||||||
bstr = "1.9.1"
|
bstr = "1.9.1"
|
||||||
bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
|
bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] }
|
||||||
byteorder = "1.5.0"
|
byteorder = "1.5.0"
|
||||||
charabia = { version = "0.9.1", default-features = false }
|
charabia = { version = "0.9.1", default-features = false }
|
||||||
concat-arrays = "0.1.2"
|
concat-arrays = "0.1.2"
|
||||||
@ -79,8 +79,8 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
|
|||||||
] }
|
] }
|
||||||
tiktoken-rs = "0.5.9"
|
tiktoken-rs = "0.5.9"
|
||||||
liquid = "0.26.6"
|
liquid = "0.26.6"
|
||||||
rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
||||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
arroy = "0.5.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
ureq = { version = "2.10.0", features = ["json"] }
|
ureq = { version = "2.10.0", features = ["json"] }
|
||||||
@ -98,16 +98,7 @@ rand = { version = "0.8.5", features = ["small_rng"] }
|
|||||||
|
|
||||||
[features]
|
[features]
|
||||||
all-tokenizations = [
|
all-tokenizations = [
|
||||||
"charabia/chinese",
|
"charabia/default",
|
||||||
"charabia/hebrew",
|
|
||||||
"charabia/japanese",
|
|
||||||
"charabia/thai",
|
|
||||||
"charabia/korean",
|
|
||||||
"charabia/greek",
|
|
||||||
"charabia/khmer",
|
|
||||||
"charabia/vietnamese",
|
|
||||||
"charabia/swedish-recomposition",
|
|
||||||
"charabia/german-segmentation",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
||||||
@ -146,5 +137,8 @@ german = ["charabia/german-segmentation"]
|
|||||||
# force swedish character recomposition
|
# force swedish character recomposition
|
||||||
swedish-recomposition = ["charabia/swedish-recomposition"]
|
swedish-recomposition = ["charabia/swedish-recomposition"]
|
||||||
|
|
||||||
|
# allow turkish specialized tokenization
|
||||||
|
turkish = ["charabia/turkish"]
|
||||||
|
|
||||||
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
||||||
cuda = ["candle-core/cuda"]
|
cuda = ["candle-core/cuda"]
|
||||||
|
@ -298,6 +298,7 @@ impl From<arroy::Error> for Error {
|
|||||||
arroy::Error::InvalidVecDimension { expected, received } => {
|
arroy::Error::InvalidVecDimension { expected, received } => {
|
||||||
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
||||||
}
|
}
|
||||||
|
arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
|
||||||
arroy::Error::DatabaseFull
|
arroy::Error::DatabaseFull
|
||||||
| arroy::Error::InvalidItemAppend
|
| arroy::Error::InvalidItemAppend
|
||||||
| arroy::Error::UnmatchingDistance { .. }
|
| arroy::Error::UnmatchingDistance { .. }
|
||||||
|
@ -1610,24 +1610,6 @@ impl Index {
|
|||||||
.unwrap_or_default())
|
.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn arroy_readers<'a>(
|
|
||||||
&'a self,
|
|
||||||
rtxn: &'a RoTxn<'a>,
|
|
||||||
embedder_id: u8,
|
|
||||||
quantized: bool,
|
|
||||||
) -> impl Iterator<Item = Result<ArroyWrapper>> + 'a {
|
|
||||||
crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| {
|
|
||||||
let reader = ArroyWrapper::new(self.vector_arroy, k, quantized);
|
|
||||||
// Here we don't care about the dimensions, but we want to know if we can read
|
|
||||||
// in the database or if its metadata are missing because there is no document with that many vectors.
|
|
||||||
match reader.dimensions(rtxn) {
|
|
||||||
Ok(_) => Some(Ok(reader)),
|
|
||||||
Err(arroy::Error::MissingMetadata(_)) => None,
|
|
||||||
Err(e) => Some(Err(e.into())),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
|
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
|
||||||
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
|
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
|
||||||
}
|
}
|
||||||
@ -1649,14 +1631,9 @@ impl Index {
|
|||||||
let embedding_configs = self.embedding_configs(rtxn)?;
|
let embedding_configs = self.embedding_configs(rtxn)?;
|
||||||
for config in embedding_configs {
|
for config in embedding_configs {
|
||||||
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
|
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
|
||||||
let embeddings = self
|
let reader =
|
||||||
.arroy_readers(rtxn, embedder_id, config.config.quantized())
|
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
|
||||||
.map_while(|reader| {
|
let embeddings = reader.item_vectors(rtxn, docid)?;
|
||||||
reader
|
|
||||||
.and_then(|r| r.item_vector(rtxn, docid).map_err(|e| e.into()))
|
|
||||||
.transpose()
|
|
||||||
})
|
|
||||||
.collect::<Result<Vec<_>>>()?;
|
|
||||||
res.insert(config.name.to_owned(), embeddings);
|
res.insert(config.name.to_owned(), embeddings);
|
||||||
}
|
}
|
||||||
Ok(res)
|
Ok(res)
|
||||||
|
139
crates/milli/src/search/new/matches/best_match_interval.rs
Normal file
139
crates/milli/src/search/new/matches/best_match_interval.rs
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
use super::matching_words::WordId;
|
||||||
|
use super::{Match, MatchPosition};
|
||||||
|
|
||||||
|
/// A candidate interval of matches together with the score used to rank it.
struct MatchIntervalWithScore {
    // indices, in the `matches` slice, of the first and the last match of the interval.
    interval: [usize; 2],
    // `[unique matches, distance, ordered matches]`, as returned by `get_interval_score`.
    score: [i16; 3],
}
|
||||||
|
|
||||||
|
/// Adds the contribution of one phrase to the running scores.
///
/// `fwp` and `lwp` are the positions of the first and of the last word of the
/// phrase. Each gap between two consecutive words of the phrase earns one
/// point of order (the words of a phrase are always ordered) and costs one
/// point of distance (they are always one word apart).
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
    let gaps_between_words = (*lwp - *fwp) as i16;

    *order_score += gaps_between_words;
    *distance_score -= gaps_between_words;
}
|
||||||
|
|
||||||
|
/// Compute the score of a match interval:
|
||||||
|
/// 1) count unique matches
|
||||||
|
/// 2) calculate distance between matches
|
||||||
|
/// 3) count ordered matches
|
||||||
|
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
|
||||||
|
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
||||||
|
let mut order_score = 0;
|
||||||
|
let mut distance_score = 0;
|
||||||
|
|
||||||
|
let mut iter = matches.iter().peekable();
|
||||||
|
while let Some(m) = iter.next() {
|
||||||
|
if let Some(next_match) = iter.peek() {
|
||||||
|
// if matches are ordered
|
||||||
|
if next_match.ids.iter().min() > m.ids.iter().min() {
|
||||||
|
order_score += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let m_last_word_pos = match m.position {
|
||||||
|
MatchPosition::Word { word_position, .. } => word_position,
|
||||||
|
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
|
||||||
|
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
|
||||||
|
lwp
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let next_match_first_word_pos = next_match.get_first_word_pos();
|
||||||
|
|
||||||
|
// compute distance between matches
|
||||||
|
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
|
||||||
|
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
|
||||||
|
// in case last match is a phrase, count score for its words
|
||||||
|
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
|
||||||
|
}
|
||||||
|
|
||||||
|
ids.extend(m.ids.iter());
|
||||||
|
}
|
||||||
|
|
||||||
|
ids.sort_unstable();
|
||||||
|
ids.dedup();
|
||||||
|
let uniq_score = ids.len() as i16;
|
||||||
|
|
||||||
|
// rank by unique match count, then by distance between matches, then by ordered match count.
|
||||||
|
[uniq_score, distance_score, order_score]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the first and last match where the score computed by match_interval_score is the best.
|
||||||
|
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
|
||||||
|
if matches.is_empty() {
|
||||||
|
panic!("`matches` should not be empty at this point");
|
||||||
|
}
|
||||||
|
|
||||||
|
// positions of the first and the last match of the best matches interval in `matches`.
|
||||||
|
let mut best_interval: Option<MatchIntervalWithScore> = None;
|
||||||
|
|
||||||
|
let mut save_best_interval = |interval_first, interval_last| {
|
||||||
|
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
|
||||||
|
let is_interval_score_better = &best_interval
|
||||||
|
.as_ref()
|
||||||
|
.map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
|
||||||
|
|
||||||
|
if *is_interval_score_better {
|
||||||
|
best_interval = Some(MatchIntervalWithScore {
|
||||||
|
interval: [interval_first, interval_last],
|
||||||
|
score: interval_score,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// we compute the matches interval if we have at least 2 matches.
|
||||||
|
// current interval positions.
|
||||||
|
let mut interval_first = 0;
|
||||||
|
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
|
||||||
|
|
||||||
|
for (index, next_match) in matches.iter().enumerate() {
|
||||||
|
// if next match would make interval gross more than crop_size,
|
||||||
|
// we compare the current interval with the best one,
|
||||||
|
// then we increase `interval_first` until next match can be added.
|
||||||
|
let next_match_last_word_pos = next_match.get_last_word_pos();
|
||||||
|
|
||||||
|
// if the next match would mean that we pass the crop size window,
|
||||||
|
// we take the last valid match, that didn't pass this boundry, which is `index` - 1,
|
||||||
|
// and calculate a score for it, and check if it's better than our best so far
|
||||||
|
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
|
||||||
|
// if index is 0 there is no last viable match
|
||||||
|
if index != 0 {
|
||||||
|
let interval_last = index - 1;
|
||||||
|
// keep interval if it's the best
|
||||||
|
save_best_interval(interval_first, interval_last);
|
||||||
|
}
|
||||||
|
|
||||||
|
// advance start of the interval while interval is longer than crop_size.
|
||||||
|
loop {
|
||||||
|
interval_first += 1;
|
||||||
|
if interval_first == matches.len() {
|
||||||
|
interval_first -= 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
|
||||||
|
|
||||||
|
if interval_first_match_first_word_pos > next_match_last_word_pos
|
||||||
|
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// compute the last interval score and compare it to the best one.
|
||||||
|
let interval_last = matches.len() - 1;
|
||||||
|
// if it's the last match with itself, we need to make sure it's
|
||||||
|
// not a phrase longer than the crop window
|
||||||
|
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
|
||||||
|
save_best_interval(interval_first, interval_last);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if none of the matches fit the criteria above, default to the first one
|
||||||
|
best_interval.map_or(
|
||||||
|
[&matches[0], &matches[0]],
|
||||||
|
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
|
||||||
|
)
|
||||||
|
}
|
62
crates/milli/src/search/new/matches/match.rs
Normal file
62
crates/milli/src/search/new/matches/match.rs
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
use super::matching_words::WordId;
|
||||||
|
|
||||||
|
/// Location of a match in the text, expressed both in words and in tokens.
#[derive(Debug, Clone)]
pub enum MatchPosition {
    Word {
        /// Position of the word in the whole text.
        word_position: usize,
        /// Position of the token in the whole text.
        token_position: usize,
    },
    Phrase {
        /// Positions, in the whole text, of the first and of the last word of the phrase.
        word_positions: [usize; 2],
        /// Positions, in the whole text, of the first and of the last token of the phrase.
        token_positions: [usize; 2],
    },
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Match {
|
||||||
|
pub char_count: usize,
|
||||||
|
// ids of the query words that matches.
|
||||||
|
pub ids: Vec<WordId>,
|
||||||
|
pub position: MatchPosition,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Match {
|
||||||
|
pub(super) fn get_first_word_pos(&self) -> usize {
|
||||||
|
match self.position {
|
||||||
|
MatchPosition::Word { word_position, .. } => word_position,
|
||||||
|
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(super) fn get_last_word_pos(&self) -> usize {
|
||||||
|
match self.position {
|
||||||
|
MatchPosition::Word { word_position, .. } => word_position,
|
||||||
|
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(super) fn get_first_token_pos(&self) -> usize {
|
||||||
|
match self.position {
|
||||||
|
MatchPosition::Word { token_position, .. } => token_position,
|
||||||
|
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(super) fn get_last_token_pos(&self) -> usize {
|
||||||
|
match self.position {
|
||||||
|
MatchPosition::Word { token_position, .. } => token_position,
|
||||||
|
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(super) fn get_word_count(&self) -> usize {
|
||||||
|
match self.position {
|
||||||
|
MatchPosition::Word { .. } => 1,
|
||||||
|
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -86,14 +86,17 @@ impl MatchingWords {
|
|||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
let prefix_length = char_index + c.len_utf8();
|
let prefix_length = char_index + c.len_utf8();
|
||||||
let char_len = token.original_lengths(prefix_length).0;
|
let (char_count, byte_len) = token.original_lengths(prefix_length);
|
||||||
let ids = &located_words.positions;
|
let ids = &located_words.positions;
|
||||||
return Some(MatchType::Full { char_len, ids });
|
return Some(MatchType::Full { ids, char_count, byte_len });
|
||||||
// else we exact match the token.
|
// else we exact match the token.
|
||||||
} else if token.lemma() == word {
|
} else if token.lemma() == word {
|
||||||
let char_len = token.char_end - token.char_start;
|
|
||||||
let ids = &located_words.positions;
|
let ids = &located_words.positions;
|
||||||
return Some(MatchType::Full { char_len, ids });
|
return Some(MatchType::Full {
|
||||||
|
char_count: token.char_end - token.char_start,
|
||||||
|
byte_len: token.byte_end - token.byte_start,
|
||||||
|
ids,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,7 +133,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> {
|
|||||||
word.map(|word| self.matching_words.word_interner.get(word).as_str())
|
word.map(|word| self.matching_words.word_interner.get(word).as_str())
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
|
let partial = PartialMatch { matching_words: words, ids };
|
||||||
|
|
||||||
partial.match_token(self.token).or_else(|| self.next())
|
partial.match_token(self.token).or_else(|| self.next())
|
||||||
}
|
}
|
||||||
@ -149,7 +152,7 @@ pub type WordId = u16;
|
|||||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum MatchType<'a> {
|
pub enum MatchType<'a> {
|
||||||
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
|
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
|
||||||
Partial(PartialMatch<'a>),
|
Partial(PartialMatch<'a>),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -158,7 +161,6 @@ pub enum MatchType<'a> {
|
|||||||
pub struct PartialMatch<'a> {
|
pub struct PartialMatch<'a> {
|
||||||
matching_words: Vec<Option<&'a str>>,
|
matching_words: Vec<Option<&'a str>>,
|
||||||
ids: &'a RangeInclusive<WordId>,
|
ids: &'a RangeInclusive<WordId>,
|
||||||
char_len: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> PartialMatch<'a> {
|
impl<'a> PartialMatch<'a> {
|
||||||
@ -176,25 +178,24 @@ impl<'a> PartialMatch<'a> {
|
|||||||
None => token.is_stopword(),
|
None => token.is_stopword(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let char_len = token.char_end - token.char_start;
|
|
||||||
// if there are remaining words to match in the phrase and the current token is matching,
|
// if there are remaining words to match in the phrase and the current token is matching,
|
||||||
// return a new Partial match allowing the highlighter to continue.
|
// return a new Partial match allowing the highlighter to continue.
|
||||||
if is_matching && matching_words.len() > 1 {
|
if is_matching && matching_words.len() > 1 {
|
||||||
matching_words.remove(0);
|
matching_words.remove(0);
|
||||||
Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
|
Some(MatchType::Partial(Self { matching_words, ids }))
|
||||||
// if there is no remaining word to match in the phrase and the current token is matching,
|
// if there is no remaining word to match in the phrase and the current token is matching,
|
||||||
// return a Full match.
|
// return a Full match.
|
||||||
} else if is_matching {
|
} else if is_matching {
|
||||||
Some(MatchType::Full { char_len, ids })
|
Some(MatchType::Full {
|
||||||
|
char_count: token.char_end - token.char_start,
|
||||||
|
byte_len: token.byte_end - token.byte_start,
|
||||||
|
ids,
|
||||||
|
})
|
||||||
// if the current token doesn't match, return None to break the match sequence.
|
// if the current token doesn't match, return None to break the match sequence.
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn char_len(&self) -> usize {
|
|
||||||
self.char_len
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for MatchingWords {
|
impl fmt::Debug for MatchingWords {
|
||||||
@ -276,7 +277,7 @@ pub(crate) mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
})
|
})
|
||||||
.next(),
|
.next(),
|
||||||
Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
|
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
matching_words
|
matching_words
|
||||||
@ -300,7 +301,7 @@ pub(crate) mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
})
|
})
|
||||||
.next(),
|
.next(),
|
||||||
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
|
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
matching_words
|
matching_words
|
||||||
@ -312,7 +313,7 @@ pub(crate) mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
})
|
})
|
||||||
.next(),
|
.next(),
|
||||||
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
|
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
matching_words
|
matching_words
|
||||||
|
@ -1,11 +1,19 @@
|
|||||||
use std::borrow::Cow;
|
mod best_match_interval;
|
||||||
|
mod r#match;
|
||||||
|
mod matching_words;
|
||||||
|
mod simple_token_kind;
|
||||||
|
|
||||||
use charabia::{Language, SeparatorKind, Token, Tokenizer};
|
use charabia::{Language, SeparatorKind, Token, Tokenizer};
|
||||||
|
use either::Either;
|
||||||
pub use matching_words::MatchingWords;
|
pub use matching_words::MatchingWords;
|
||||||
use matching_words::{MatchType, PartialMatch, WordId};
|
use matching_words::{MatchType, PartialMatch};
|
||||||
|
use r#match::{Match, MatchPosition};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
use simple_token_kind::SimpleTokenKind;
|
||||||
pub mod matching_words;
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
|
cmp::{max, min},
|
||||||
|
};
|
||||||
|
|
||||||
const DEFAULT_CROP_MARKER: &str = "…";
|
const DEFAULT_CROP_MARKER: &str = "…";
|
||||||
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
|
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
|
||||||
@ -93,17 +101,6 @@ impl FormatOptions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct Match {
|
|
||||||
match_len: usize,
|
|
||||||
// ids of the query words that matches.
|
|
||||||
ids: Vec<WordId>,
|
|
||||||
// position of the word in the whole text.
|
|
||||||
word_position: usize,
|
|
||||||
// position of the token in the whole text.
|
|
||||||
token_position: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
|
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct MatchBounds {
|
pub struct MatchBounds {
|
||||||
pub start: usize,
|
pub start: usize,
|
||||||
@ -130,41 +127,27 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
/// compute_partial_match peeks into the next words to validate whether the match is complete.
|
/// compute_partial_match peeks into the next words to validate whether the match is complete.
|
||||||
fn compute_partial_match<'a>(
|
fn compute_partial_match<'a>(
|
||||||
mut partial: PartialMatch<'a>,
|
mut partial: PartialMatch<'a>,
|
||||||
token_position: usize,
|
first_token_position: usize,
|
||||||
word_position: usize,
|
first_word_position: usize,
|
||||||
|
first_word_char_start: &usize,
|
||||||
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
|
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
|
||||||
matches: &mut Vec<Match>,
|
matches: &mut Vec<Match>,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
|
|
||||||
|
|
||||||
for (token_position, word_position, word) in words_positions {
|
for (token_position, word_position, word) in words_positions {
|
||||||
partial = match partial.match_token(word) {
|
partial = match partial.match_token(word) {
|
||||||
// token matches the partial match, but the match is not full,
|
// token matches the partial match, but the match is not full,
|
||||||
// we temporarily save the current token then we try to match the next one.
|
// we temporarily save the current token then we try to match the next one.
|
||||||
Some(MatchType::Partial(partial)) => {
|
Some(MatchType::Partial(partial)) => partial,
|
||||||
potential_matches.push((token_position, word_position, partial.char_len()));
|
|
||||||
partial
|
|
||||||
}
|
|
||||||
// partial match is now full, we keep this matches and we advance positions
|
// partial match is now full, we keep this matches and we advance positions
|
||||||
Some(MatchType::Full { char_len, ids }) => {
|
Some(MatchType::Full { ids, .. }) => {
|
||||||
let ids: Vec<_> = ids.clone().collect();
|
|
||||||
// save previously matched tokens as matches.
|
|
||||||
let iter = potential_matches.into_iter().map(
|
|
||||||
|(token_position, word_position, match_len)| Match {
|
|
||||||
match_len,
|
|
||||||
ids: ids.clone(),
|
|
||||||
word_position,
|
|
||||||
token_position,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
matches.extend(iter);
|
|
||||||
|
|
||||||
// save the token that closes the partial match as a match.
|
// save the token that closes the partial match as a match.
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: char_len,
|
char_count: word.char_end - *first_word_char_start,
|
||||||
ids,
|
ids: ids.clone().collect(),
|
||||||
word_position,
|
position: MatchPosition::Phrase {
|
||||||
token_position,
|
word_positions: [first_word_position, word_position],
|
||||||
|
token_positions: [first_token_position, token_position],
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// the match is complete, we return true.
|
// the match is complete, we return true.
|
||||||
@ -202,13 +185,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
match match_type {
|
match match_type {
|
||||||
// we match, we save the current token as a match,
|
// we match, we save the current token as a match,
|
||||||
// then we continue the rest of the tokens.
|
// then we continue the rest of the tokens.
|
||||||
MatchType::Full { char_len, ids } => {
|
MatchType::Full { ids, char_count, .. } => {
|
||||||
let ids: Vec<_> = ids.clone().collect();
|
let ids: Vec<_> = ids.clone().collect();
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: char_len,
|
char_count,
|
||||||
ids,
|
ids,
|
||||||
word_position,
|
position: MatchPosition::Word { word_position, token_position },
|
||||||
token_position,
|
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -221,6 +203,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
partial,
|
partial,
|
||||||
token_position,
|
token_position,
|
||||||
word_position,
|
word_position,
|
||||||
|
&word.char_start,
|
||||||
&mut wp,
|
&mut wp,
|
||||||
&mut matches,
|
&mut matches,
|
||||||
) {
|
) {
|
||||||
@ -243,56 +226,99 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
Some((tokens, matches)) => matches
|
Some((tokens, matches)) => matches
|
||||||
.iter()
|
.iter()
|
||||||
.map(|m| MatchBounds {
|
.map(|m| MatchBounds {
|
||||||
start: tokens[m.token_position].byte_start,
|
start: tokens[m.get_first_token_pos()].byte_start,
|
||||||
length: m.match_len,
|
// TODO: Why is this in chars, while start is in bytes?
|
||||||
|
length: m.char_count,
|
||||||
})
|
})
|
||||||
.collect(),
|
.collect(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the bounds in byte index of the crop window.
|
/// Returns the bounds in byte index of the crop window.
|
||||||
fn crop_bounds(
|
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
|
||||||
&self,
|
let (
|
||||||
tokens: &[Token<'_>],
|
mut remaining_words,
|
||||||
matches: &[Match],
|
is_iterating_forward,
|
||||||
crop_size: usize,
|
before_tokens_starting_index,
|
||||||
) -> (usize, usize) {
|
after_tokens_starting_index,
|
||||||
// if there is no match, we start from the beginning of the string by default.
|
) = if !matches.is_empty() {
|
||||||
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
|
let [matches_first, matches_last] =
|
||||||
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
|
best_match_interval::find_best_match_interval(matches, crop_size);
|
||||||
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
|
|
||||||
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
|
|
||||||
|
|
||||||
// matches needs to be counted in the crop len.
|
let matches_size =
|
||||||
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
|
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
|
||||||
|
|
||||||
|
let is_crop_size_gte_match_size = crop_size >= matches_size;
|
||||||
|
let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
|
||||||
|
|
||||||
|
let remaining_words = if is_crop_size_gte_match_size {
|
||||||
|
crop_size - matches_size
|
||||||
|
} else {
|
||||||
|
// in case matches size is greater than crop size, which implies there's only one match,
|
||||||
|
// we count words backwards, because we have to remove words, as they're extra words outside of
|
||||||
|
// crop window
|
||||||
|
matches_size - crop_size
|
||||||
|
};
|
||||||
|
|
||||||
|
let after_tokens_starting_index = if matches_size == 0 {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
|
||||||
|
if last_match_last_token_position_plus_one < tokens.len() {
|
||||||
|
last_match_last_token_position_plus_one
|
||||||
|
} else {
|
||||||
|
// we have matched the end of possible tokens, there's nothing to advance
|
||||||
|
tokens.len() - 1
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
(
|
||||||
|
remaining_words,
|
||||||
|
is_iterating_forward,
|
||||||
|
if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
|
||||||
|
after_tokens_starting_index,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
(crop_size, true, 0, 0)
|
||||||
|
};
|
||||||
|
|
||||||
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
||||||
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
||||||
let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
|
let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
|
||||||
// an iterator starting from the last match token position and going towards the end of the text.
|
// an iterator ...
|
||||||
let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
|
let mut after_tokens = if is_iterating_forward {
|
||||||
|
// ... starting from the last match token position and going towards the end of the text.
|
||||||
|
Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
|
||||||
|
} else {
|
||||||
|
// ... starting from the last match token position and going towards the start of the text.
|
||||||
|
Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
|
||||||
|
};
|
||||||
|
|
||||||
// grows the crop window peeking in both directions
|
// grows the crop window peeking in both directions
|
||||||
// until the window contains the right number of words:
|
// until the window contains the right number of words:
|
||||||
while remaining_words > 0 {
|
while remaining_words > 0 {
|
||||||
let before_token = before_tokens.peek().map(|t| t.separator_kind());
|
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
|
||||||
let after_token = after_tokens.peek().map(|t| t.separator_kind());
|
let after_token_kind =
|
||||||
|
after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
|
||||||
|
|
||||||
match (before_token, after_token) {
|
match (before_token_kind, after_token_kind) {
|
||||||
// we can expand both sides.
|
// we can expand both sides.
|
||||||
(Some(before_token), Some(after_token)) => {
|
(Some(before_token_kind), Some(after_token_kind)) => {
|
||||||
match (before_token, after_token) {
|
match (before_token_kind, after_token_kind) {
|
||||||
// if they are both separators and are the same kind then advance both,
|
// if they are both separators and are the same kind then advance both,
|
||||||
// or expand on the soft separator side.
|
// or expand on the soft separator side.
|
||||||
(Some(before_token_kind), Some(after_token_kind)) => {
|
(
|
||||||
if before_token_kind == after_token_kind {
|
SimpleTokenKind::Separator(before_token_separator_kind),
|
||||||
|
SimpleTokenKind::Separator(after_token_separator_kind),
|
||||||
|
) => {
|
||||||
|
if before_token_separator_kind == after_token_separator_kind {
|
||||||
before_tokens.next();
|
before_tokens.next();
|
||||||
|
|
||||||
// this avoids having an ending separator before the crop marker.
|
// this avoids having an ending separator before the crop marker.
|
||||||
if remaining_words > 1 {
|
if remaining_words > 1 {
|
||||||
after_tokens.next();
|
after_tokens.next();
|
||||||
}
|
}
|
||||||
} else if before_token_kind == SeparatorKind::Hard {
|
} else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
|
||||||
after_tokens.next();
|
after_tokens.next();
|
||||||
} else {
|
} else {
|
||||||
before_tokens.next();
|
before_tokens.next();
|
||||||
@ -300,17 +326,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
}
|
}
|
||||||
// if one of the tokens is a word, we expand on the side of the word.
|
// if one of the tokens is a word, we expand on the side of the word.
|
||||||
// left is a word, advance left.
|
// left is a word, advance left.
|
||||||
(None, Some(_)) => {
|
(SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
|
||||||
before_tokens.next();
|
before_tokens.next();
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
// right is a word, advance right.
|
// right is a word, advance right.
|
||||||
(Some(_), None) => {
|
(SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
|
||||||
after_tokens.next();
|
after_tokens.next();
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
// both are words, advance left then right if remaining_words > 0.
|
// both are words, advance left then right if remaining_words > 0.
|
||||||
(None, None) => {
|
(SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
|
||||||
before_tokens.next();
|
before_tokens.next();
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
|
|
||||||
@ -322,16 +348,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// the end of the text is reached, advance left.
|
// the end of the text is reached, advance left.
|
||||||
(Some(before_token), None) => {
|
(Some(before_token_kind), None) => {
|
||||||
before_tokens.next();
|
before_tokens.next();
|
||||||
if before_token.is_none() {
|
if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// the start of the text is reached, advance right.
|
// the start of the text is reached, advance right.
|
||||||
(None, Some(after_token)) => {
|
(None, Some(after_token_kind)) => {
|
||||||
after_tokens.next();
|
after_tokens.next();
|
||||||
if after_token.is_none() {
|
if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -344,86 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
||||||
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
||||||
|
|
||||||
(crop_byte_start, crop_byte_end)
|
[crop_byte_start, crop_byte_end]
|
||||||
}
|
|
||||||
|
|
||||||
/// Compute the score of a match interval:
|
|
||||||
/// 1) count unique matches
|
|
||||||
/// 2) calculate distance between matches
|
|
||||||
/// 3) count ordered matches
|
|
||||||
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
|
|
||||||
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
|
||||||
let mut order_score = 0;
|
|
||||||
let mut distance_score = 0;
|
|
||||||
|
|
||||||
let mut iter = matches.iter().peekable();
|
|
||||||
while let Some(m) = iter.next() {
|
|
||||||
if let Some(next_match) = iter.peek() {
|
|
||||||
// if matches are ordered
|
|
||||||
if next_match.ids.iter().min() > m.ids.iter().min() {
|
|
||||||
order_score += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// compute distance between matches
|
|
||||||
distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
|
|
||||||
}
|
|
||||||
|
|
||||||
ids.extend(m.ids.iter());
|
|
||||||
}
|
|
||||||
|
|
||||||
ids.sort_unstable();
|
|
||||||
ids.dedup();
|
|
||||||
let uniq_score = ids.len() as i16;
|
|
||||||
|
|
||||||
// rank by unique match count, then by distance between matches, then by ordered match count.
|
|
||||||
(uniq_score, distance_score, order_score)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the matches interval where the score computed by match_interval_score is the best.
|
|
||||||
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
|
|
||||||
// we compute the matches interval if we have at least 2 matches.
|
|
||||||
if matches.len() > 1 {
|
|
||||||
// positions of the first and the last match of the best matches interval in `matches`.
|
|
||||||
let mut best_interval = (0, 0);
|
|
||||||
let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
|
|
||||||
// current interval positions.
|
|
||||||
let mut interval_first = 0;
|
|
||||||
let mut interval_last = 0;
|
|
||||||
for (index, next_match) in matches.iter().enumerate().skip(1) {
|
|
||||||
// if next match would make interval grow more than crop_size,
|
|
||||||
// we compare the current interval with the best one,
|
|
||||||
// then we increase `interval_first` until next match can be added.
|
|
||||||
if next_match.word_position - matches[interval_first].word_position >= crop_size {
|
|
||||||
let interval_score =
|
|
||||||
self.match_interval_score(&matches[interval_first..=interval_last]);
|
|
||||||
|
|
||||||
// keep interval if it's the best
|
|
||||||
if interval_score > best_interval_score {
|
|
||||||
best_interval = (interval_first, interval_last);
|
|
||||||
best_interval_score = interval_score;
|
|
||||||
}
|
|
||||||
|
|
||||||
// advance start of the interval while interval is longer than crop_size.
|
|
||||||
while next_match.word_position - matches[interval_first].word_position
|
|
||||||
>= crop_size
|
|
||||||
{
|
|
||||||
interval_first += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
interval_last = index;
|
|
||||||
}
|
|
||||||
|
|
||||||
// compute the last interval score and compare it to the best one.
|
|
||||||
let interval_score =
|
|
||||||
self.match_interval_score(&matches[interval_first..=interval_last]);
|
|
||||||
if interval_score > best_interval_score {
|
|
||||||
best_interval = (interval_first, interval_last);
|
|
||||||
}
|
|
||||||
|
|
||||||
&matches[best_interval.0..=best_interval.1]
|
|
||||||
} else {
|
|
||||||
matches
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the formatted version of the original text.
|
// Returns the formatted version of the original text.
|
||||||
@ -434,69 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
} else {
|
} else {
|
||||||
match &self.matches {
|
match &self.matches {
|
||||||
Some((tokens, matches)) => {
|
Some((tokens, matches)) => {
|
||||||
// If the text has to be cropped,
|
// If the text has to be cropped, crop around the best interval.
|
||||||
// crop around the best interval.
|
let [crop_byte_start, crop_byte_end] = match format_options.crop {
|
||||||
let (byte_start, byte_end) = match format_options.crop {
|
|
||||||
Some(crop_size) if crop_size > 0 => {
|
Some(crop_size) if crop_size > 0 => {
|
||||||
let matches = self.find_best_match_interval(matches, crop_size);
|
|
||||||
self.crop_bounds(tokens, matches, crop_size)
|
self.crop_bounds(tokens, matches, crop_size)
|
||||||
}
|
}
|
||||||
_ => (0, self.text.len()),
|
_ => [0, self.text.len()],
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut formatted = Vec::new();
|
let mut formatted = Vec::new();
|
||||||
|
|
||||||
// push crop marker if it's not the start of the text.
|
// push crop marker if it's not the start of the text.
|
||||||
if byte_start > 0 && !self.crop_marker.is_empty() {
|
if crop_byte_start > 0 && !self.crop_marker.is_empty() {
|
||||||
formatted.push(self.crop_marker);
|
formatted.push(self.crop_marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut byte_index = byte_start;
|
let mut byte_index = crop_byte_start;
|
||||||
|
|
||||||
if format_options.highlight {
|
if format_options.highlight {
|
||||||
// insert highlight markers around matches.
|
// insert highlight markers around matches.
|
||||||
for m in matches {
|
for m in matches {
|
||||||
let token = &tokens[m.token_position];
|
let [m_byte_start, m_byte_end] = match m.position {
|
||||||
|
MatchPosition::Word { token_position, .. } => {
|
||||||
|
let token = &tokens[token_position];
|
||||||
|
[&token.byte_start, &token.byte_end]
|
||||||
|
}
|
||||||
|
MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
|
||||||
|
[&tokens[ftp].byte_start, &tokens[ltp].byte_end]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// skip matches out of the crop window.
|
// skip matches out of the crop window
|
||||||
if token.byte_start < byte_start || token.byte_end > byte_end {
|
if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if byte_index < token.byte_start {
|
// adjust start and end to the crop window size
|
||||||
formatted.push(&self.text[byte_index..token.byte_start]);
|
let [m_byte_start, m_byte_end] = [
|
||||||
|
max(m_byte_start, &crop_byte_start),
|
||||||
|
min(m_byte_end, &crop_byte_end),
|
||||||
|
];
|
||||||
|
|
||||||
|
// push text that is positioned before our matches
|
||||||
|
if byte_index < *m_byte_start {
|
||||||
|
formatted.push(&self.text[byte_index..*m_byte_start]);
|
||||||
}
|
}
|
||||||
|
|
||||||
let highlight_byte_index = self.text[token.byte_start..]
|
|
||||||
.char_indices()
|
|
||||||
.enumerate()
|
|
||||||
.find(|(i, _)| *i == m.match_len)
|
|
||||||
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
|
|
||||||
formatted.push(self.highlight_prefix);
|
formatted.push(self.highlight_prefix);
|
||||||
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
|
|
||||||
|
// TODO: This is additional work done, charabia::token::Token byte_len
|
||||||
|
// should already get us the original byte length, however, that doesn't work as
|
||||||
|
// it's supposed to, investigate why
|
||||||
|
let highlight_byte_index = self.text[*m_byte_start..]
|
||||||
|
.char_indices()
|
||||||
|
.nth(m.char_count)
|
||||||
|
.map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
|
||||||
|
formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
|
||||||
|
|
||||||
formatted.push(self.highlight_suffix);
|
formatted.push(self.highlight_suffix);
|
||||||
|
|
||||||
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
||||||
if highlight_byte_index < token.byte_end {
|
if highlight_byte_index < *m_byte_end {
|
||||||
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
|
formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
|
||||||
}
|
}
|
||||||
|
|
||||||
byte_index = token.byte_end;
|
byte_index = *m_byte_end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// push the rest of the text between last match and the end of crop.
|
// push the rest of the text between last match and the end of crop.
|
||||||
if byte_index < byte_end {
|
if byte_index < crop_byte_end {
|
||||||
formatted.push(&self.text[byte_index..byte_end]);
|
formatted.push(&self.text[byte_index..crop_byte_end]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// push crop marker if it's not the end of the text.
|
// push crop marker if it's not the end of the text.
|
||||||
if byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
||||||
formatted.push(self.crop_marker);
|
formatted.push(self.crop_marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
if formatted.len() == 1 {
|
if formatted.len() == 1 {
|
||||||
// avoid concatenating if there is already 1 slice.
|
// avoid concatenating if there is already 1 slice.
|
||||||
Cow::Borrowed(&self.text[byte_start..byte_end])
|
Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
|
||||||
} else {
|
} else {
|
||||||
Cow::Owned(formatted.concat())
|
Cow::Owned(formatted.concat())
|
||||||
}
|
}
|
||||||
@ -821,22 +786,24 @@ mod tests {
|
|||||||
fn format_highlight_crop_phrase_query() {
|
fn format_highlight_crop_phrase_query() {
|
||||||
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
|
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
|
||||||
let temp_index = TempIndex::new();
|
let temp_index = TempIndex::new();
|
||||||
|
|
||||||
|
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
|
||||||
temp_index
|
temp_index
|
||||||
.add_documents(documents!([
|
.add_documents(documents!([
|
||||||
{ "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
|
{ "id": 1, "text": text }
|
||||||
]))
|
]))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let rtxn = temp_index.read_txn().unwrap();
|
let rtxn = temp_index.read_txn().unwrap();
|
||||||
|
|
||||||
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
||||||
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
|
|
||||||
|
|
||||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
|
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
|
||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
|
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"…had the power to split <em>the</em> <em>world</em> between those who…"
|
@"…the power to split <em>the world</em> between those who embraced…"
|
||||||
);
|
);
|
||||||
|
|
||||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
|
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
|
||||||
@ -844,7 +811,63 @@ mod tests {
|
|||||||
// should highlight "those" and the phrase "and those".
|
// should highlight "those" and the phrase "and those".
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
|
@"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
|
||||||
|
);
|
||||||
|
|
||||||
|
let builder = MatcherBuilder::new_test(
|
||||||
|
&rtxn,
|
||||||
|
&temp_index,
|
||||||
|
"\"The groundbreaking invention had the power to split the world\"",
|
||||||
|
);
|
||||||
|
let mut matcher = builder.build(text, None);
|
||||||
|
insta::assert_snapshot!(
|
||||||
|
matcher.format(format_options),
|
||||||
|
@"<em>The groundbreaking invention had the power to split the world</em>…"
|
||||||
|
);
|
||||||
|
|
||||||
|
let builder = MatcherBuilder::new_test(
|
||||||
|
&rtxn,
|
||||||
|
&temp_index,
|
||||||
|
"\"The groundbreaking invention had the power to split the world between those\"",
|
||||||
|
);
|
||||||
|
let mut matcher = builder.build(text, None);
|
||||||
|
insta::assert_snapshot!(
|
||||||
|
matcher.format(format_options),
|
||||||
|
@"<em>The groundbreaking invention had the power to split the world</em>…"
|
||||||
|
);
|
||||||
|
|
||||||
|
let builder = MatcherBuilder::new_test(
|
||||||
|
&rtxn,
|
||||||
|
&temp_index,
|
||||||
|
"\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
|
||||||
|
);
|
||||||
|
let mut matcher = builder.build(text, None);
|
||||||
|
insta::assert_snapshot!(
|
||||||
|
matcher.format(format_options),
|
||||||
|
// TODO: Should include exclamation mark without crop markers
|
||||||
|
@"…between those who <em>embraced progress and those who resisted change</em>…"
|
||||||
|
);
|
||||||
|
|
||||||
|
let builder = MatcherBuilder::new_test(
|
||||||
|
&rtxn,
|
||||||
|
&temp_index,
|
||||||
|
"\"groundbreaking invention\" \"split the world between\"",
|
||||||
|
);
|
||||||
|
let mut matcher = builder.build(text, None);
|
||||||
|
insta::assert_snapshot!(
|
||||||
|
matcher.format(format_options),
|
||||||
|
@"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
|
||||||
|
);
|
||||||
|
|
||||||
|
let builder = MatcherBuilder::new_test(
|
||||||
|
&rtxn,
|
||||||
|
&temp_index,
|
||||||
|
"\"groundbreaking invention\" \"had the power to split the world between those\"",
|
||||||
|
);
|
||||||
|
let mut matcher = builder.build(text, None);
|
||||||
|
insta::assert_snapshot!(
|
||||||
|
matcher.format(format_options),
|
||||||
|
@"…<em>invention</em> <em>had the power to split the world between those</em>…"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -900,7 +923,7 @@ mod tests {
|
|||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
|
@"_the_ _do or_ die can't be he do and or isn'_t he_"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
15
crates/milli/src/search/new/matches/simple_token_kind.rs
Normal file
15
crates/milli/src/search/new/matches/simple_token_kind.rs
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
use charabia::{SeparatorKind, Token, TokenKind};
|
||||||
|
|
||||||
|
pub enum SimpleTokenKind {
|
||||||
|
Separator(SeparatorKind),
|
||||||
|
NotSeparator,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SimpleTokenKind {
|
||||||
|
pub fn new(token: &&Token<'_>) -> Self {
|
||||||
|
match token.kind {
|
||||||
|
TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
|
||||||
|
_ => Self::NotSeparator,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -1,11 +1,10 @@
|
|||||||
use std::iter::FromIterator;
|
use std::iter::FromIterator;
|
||||||
|
|
||||||
use ordered_float::OrderedFloat;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||||
use crate::score_details::{self, ScoreDetails};
|
use crate::score_details::{self, ScoreDetails};
|
||||||
use crate::vector::{DistributionShift, Embedder};
|
use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
|
||||||
use crate::{DocumentId, Result, SearchContext, SearchLogger};
|
use crate::{DocumentId, Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
||||||
@ -53,14 +52,9 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
|
|||||||
vector_candidates: &RoaringBitmap,
|
vector_candidates: &RoaringBitmap,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let target = &self.target;
|
let target = &self.target;
|
||||||
let mut results = Vec::new();
|
|
||||||
|
|
||||||
for reader in ctx.index.arroy_readers(ctx.txn, self.embedder_index, self.quantized) {
|
let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
|
||||||
let nns_by_vector =
|
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
|
||||||
reader?.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
|
|
||||||
results.extend(nns_by_vector.into_iter());
|
|
||||||
}
|
|
||||||
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
|
||||||
self.cached_sorted_docids = results.into_iter();
|
self.cached_sorted_docids = results.into_iter();
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use ordered_float::OrderedFloat;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::score_details::{self, ScoreDetails};
|
use crate::score_details::{self, ScoreDetails};
|
||||||
use crate::vector::Embedder;
|
use crate::vector::{ArroyWrapper, Embedder};
|
||||||
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
|
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
|
||||||
|
|
||||||
pub struct Similar<'a> {
|
pub struct Similar<'a> {
|
||||||
@ -71,23 +70,13 @@ impl<'a> Similar<'a> {
|
|||||||
.get(self.rtxn, &self.embedder_name)?
|
.get(self.rtxn, &self.embedder_name)?
|
||||||
.ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?;
|
.ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?;
|
||||||
|
|
||||||
let mut results = Vec::new();
|
let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
|
||||||
|
let results = reader.nns_by_item(
|
||||||
for reader in self.index.arroy_readers(self.rtxn, embedder_index, self.quantized) {
|
self.rtxn,
|
||||||
let nns_by_item = reader?.nns_by_item(
|
self.id,
|
||||||
self.rtxn,
|
self.limit + self.offset + 1,
|
||||||
self.id,
|
Some(&universe),
|
||||||
self.limit + self.offset + 1,
|
)?;
|
||||||
Some(&universe),
|
|
||||||
)?;
|
|
||||||
if let Some(mut nns_by_item) = nns_by_item {
|
|
||||||
results.append(&mut nns_by_item);
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
|
||||||
|
|
||||||
let mut documents_ids = Vec::with_capacity(self.limit);
|
let mut documents_ids = Vec::with_capacity(self.limit);
|
||||||
let mut document_scores = Vec::with_capacity(self.limit);
|
let mut document_scores = Vec::with_capacity(self.limit);
|
||||||
|
@ -689,9 +689,8 @@ where
|
|||||||
key: None,
|
key: None,
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap();
|
|
||||||
let reader =
|
let reader =
|
||||||
ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized);
|
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
|
||||||
let dim = reader.dimensions(self.wtxn)?;
|
let dim = reader.dimensions(self.wtxn)?;
|
||||||
dimension.insert(name.to_string(), dim);
|
dimension.insert(name.to_string(), dim);
|
||||||
}
|
}
|
||||||
@ -700,6 +699,7 @@ where
|
|||||||
for (embedder_name, dimension) in dimension {
|
for (embedder_name, dimension) in dimension {
|
||||||
let wtxn = &mut *self.wtxn;
|
let wtxn = &mut *self.wtxn;
|
||||||
let vector_arroy = self.index.vector_arroy;
|
let vector_arroy = self.index.vector_arroy;
|
||||||
|
let cancel = &self.should_abort;
|
||||||
|
|
||||||
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||||
@ -713,17 +713,8 @@ where
|
|||||||
let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
|
let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
|
||||||
|
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
|
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
|
||||||
let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized);
|
writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?;
|
||||||
if is_quantizing {
|
|
||||||
writer.quantize(wtxn, k, dimension)?;
|
|
||||||
}
|
|
||||||
if writer.need_build(wtxn, dimension)? {
|
|
||||||
writer.build(wtxn, &mut rng, dimension)?;
|
|
||||||
} else if writer.is_empty(wtxn, dimension)? {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Result::Ok(())
|
Result::Ok(())
|
||||||
})
|
})
|
||||||
.map_err(InternalError::from)??;
|
.map_err(InternalError::from)??;
|
||||||
|
@ -990,27 +990,24 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
let readers: Result<BTreeMap<&str, (Vec<ArroyWrapper>, &RoaringBitmap)>> = settings_diff
|
let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff
|
||||||
.embedding_config_updates
|
.embedding_config_updates
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(name, action)| {
|
.filter_map(|(name, action)| {
|
||||||
if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
|
if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
|
||||||
action.write_back()
|
action.write_back()
|
||||||
{
|
{
|
||||||
let readers: Result<Vec<_>> = self
|
let reader = ArroyWrapper::new(
|
||||||
.index
|
self.index.vector_arroy,
|
||||||
.arroy_readers(wtxn, *embedder_id, action.was_quantized)
|
*embedder_id,
|
||||||
.collect();
|
action.was_quantized,
|
||||||
match readers {
|
);
|
||||||
Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
|
Some((name.as_str(), (reader, user_provided)))
|
||||||
Err(error) => Some(Err(error)),
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
let readers = readers?;
|
|
||||||
|
|
||||||
let old_vectors_fid = settings_diff
|
let old_vectors_fid = settings_diff
|
||||||
.old
|
.old
|
||||||
@ -1048,34 +1045,24 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
arroy::Error,
|
arroy::Error,
|
||||||
> = readers
|
> = readers
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(name, (readers, user_provided))| {
|
.filter_map(|(name, (reader, user_provided))| {
|
||||||
if !user_provided.contains(docid) {
|
if !user_provided.contains(docid) {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let mut vectors = Vec::new();
|
match reader.item_vectors(wtxn, docid) {
|
||||||
for reader in readers {
|
Ok(vectors) if vectors.is_empty() => None,
|
||||||
let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
|
Ok(vectors) => Some(Ok((
|
||||||
break;
|
name.to_string(),
|
||||||
};
|
serde_json::to_value(ExplicitVectors {
|
||||||
|
embeddings: Some(
|
||||||
match vector {
|
VectorOrArrayOfVectors::from_array_of_vectors(vectors),
|
||||||
Ok(vector) => vectors.push(vector),
|
),
|
||||||
Err(error) => return Some(Err(error)),
|
regenerate: false,
|
||||||
}
|
})
|
||||||
|
.unwrap(),
|
||||||
|
))),
|
||||||
|
Err(e) => Some(Err(e)),
|
||||||
}
|
}
|
||||||
if vectors.is_empty() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
Some(Ok((
|
|
||||||
name.to_string(),
|
|
||||||
serde_json::to_value(ExplicitVectors {
|
|
||||||
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
|
||||||
vectors,
|
|
||||||
)),
|
|
||||||
regenerate: false,
|
|
||||||
})
|
|
||||||
.unwrap(),
|
|
||||||
)))
|
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
@ -1104,11 +1091,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// delete all vectors from the embedders that need removal
|
// delete all vectors from the embedders that need removal
|
||||||
for (_, (readers, _)) in readers {
|
for (_, (reader, _)) in readers {
|
||||||
for reader in readers {
|
let dimensions = reader.dimensions(wtxn)?;
|
||||||
let dimensions = reader.dimensions(wtxn)?;
|
reader.clear(wtxn, dimensions)?;
|
||||||
reader.clear(wtxn, dimensions)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let grenad_params = GrenadParameters {
|
let grenad_params = GrenadParameters {
|
||||||
|
@ -673,22 +673,14 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
.get(&embedder_name)
|
.get(&embedder_name)
|
||||||
.map_or(false, |conf| conf.2);
|
.map_or(false, |conf| conf.2);
|
||||||
// FIXME: allow customizing distance
|
// FIXME: allow customizing distance
|
||||||
let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
|
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
|
||||||
.map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// remove vectors for docids we want them removed
|
// remove vectors for docids we want them removed
|
||||||
let merger = remove_vectors_builder.build();
|
let merger = remove_vectors_builder.build();
|
||||||
let mut iter = merger.into_stream_merger_iter()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, _)) = iter.next()? {
|
while let Some((key, _)) = iter.next()? {
|
||||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
writer.del_items(wtxn, expected_dimension, docid)?;
|
||||||
for writer in &writers {
|
|
||||||
// Uses invariant: vectors are packed in the first writers.
|
|
||||||
if !writer.del_item(wtxn, expected_dimension, docid)? {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// add generated embeddings
|
// add generated embeddings
|
||||||
@ -716,9 +708,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
embeddings.embedding_count(),
|
embeddings.embedding_count(),
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
writer.add_items(wtxn, docid, &embeddings)?;
|
||||||
writer.add_item(wtxn, expected_dimension, docid, embedding)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// perform the manual diff
|
// perform the manual diff
|
||||||
@ -733,51 +723,14 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||||
|
|
||||||
let mut deleted_index = None;
|
writer.del_item(wtxn, docid, &vector)?;
|
||||||
for (index, writer) in writers.iter().enumerate() {
|
|
||||||
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
|
|
||||||
// uses invariant: vectors are packed in the first writers.
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
if candidate == vector {
|
|
||||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
|
||||||
deleted_index = Some(index);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 🥲 enforce invariant: vectors are packed in the first writers.
|
|
||||||
if let Some(deleted_index) = deleted_index {
|
|
||||||
let mut last_index_with_a_vector = None;
|
|
||||||
for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
|
|
||||||
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
last_index_with_a_vector = Some((index, candidate));
|
|
||||||
}
|
|
||||||
if let Some((last_index, vector)) = last_index_with_a_vector {
|
|
||||||
// unwrap: computed the index from the list of writers
|
|
||||||
let writer = writers.get(last_index).unwrap();
|
|
||||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
|
||||||
writers.get(deleted_index).unwrap().add_item(
|
|
||||||
wtxn,
|
|
||||||
expected_dimension,
|
|
||||||
docid,
|
|
||||||
&vector,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||||
let vector = pod_collect_to_vec(value);
|
let vector = pod_collect_to_vec(value);
|
||||||
|
|
||||||
// overflow was detected during vector extraction.
|
// overflow was detected during vector extraction.
|
||||||
for writer in &writers {
|
writer.add_item(wtxn, docid, &vector)?;
|
||||||
if !writer.contains_item(wtxn, expected_dimension, docid)? {
|
|
||||||
writer.add_item(wtxn, expected_dimension, docid, &vector)?;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use arroy::distances::{Angular, BinaryQuantizedAngular};
|
use arroy::distances::{BinaryQuantizedCosine, Cosine};
|
||||||
use arroy::ItemId;
|
use arroy::ItemId;
|
||||||
use deserr::{DeserializeError, Deserr};
|
use deserr::{DeserializeError, Deserr};
|
||||||
use heed::{RoTxn, RwTxn, Unspecified};
|
use heed::{RoTxn, RwTxn, Unspecified};
|
||||||
@ -32,105 +32,243 @@ pub const REQUEST_PARALLELISM: usize = 40;
|
|||||||
|
|
||||||
pub struct ArroyWrapper {
|
pub struct ArroyWrapper {
|
||||||
quantized: bool,
|
quantized: bool,
|
||||||
index: u16,
|
embedder_index: u8,
|
||||||
database: arroy::Database<Unspecified>,
|
database: arroy::Database<Unspecified>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArroyWrapper {
|
impl ArroyWrapper {
|
||||||
pub fn new(database: arroy::Database<Unspecified>, index: u16, quantized: bool) -> Self {
|
pub fn new(
|
||||||
Self { database, index, quantized }
|
database: arroy::Database<Unspecified>,
|
||||||
|
embedder_index: u8,
|
||||||
|
quantized: bool,
|
||||||
|
) -> Self {
|
||||||
|
Self { database, embedder_index, quantized }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn index(&self) -> u16 {
|
pub fn embedder_index(&self) -> u8 {
|
||||||
self.index
|
self.embedder_index
|
||||||
|
}
|
||||||
|
|
||||||
|
fn readers<'a, D: arroy::Distance>(
|
||||||
|
&'a self,
|
||||||
|
rtxn: &'a RoTxn<'a>,
|
||||||
|
db: arroy::Database<D>,
|
||||||
|
) -> impl Iterator<Item = Result<arroy::Reader<D>, arroy::Error>> + 'a {
|
||||||
|
arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| {
|
||||||
|
match arroy::Reader::open(rtxn, index, db) {
|
||||||
|
Ok(reader) => match reader.is_empty(rtxn) {
|
||||||
|
Ok(false) => Some(Ok(reader)),
|
||||||
|
Ok(true) => None,
|
||||||
|
Err(e) => Some(Err(e)),
|
||||||
|
},
|
||||||
|
Err(arroy::Error::MissingMetadata(_)) => None,
|
||||||
|
Err(e) => Some(Err(e)),
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
|
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
|
||||||
|
let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap();
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
Ok(arroy::Reader::open(rtxn, self.index, self.quantized_db())?.dimensions())
|
Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions())
|
||||||
} else {
|
} else {
|
||||||
Ok(arroy::Reader::open(rtxn, self.index, self.angular_db())?.dimensions())
|
Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn quantize(
|
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
|
||||||
&mut self,
|
&mut self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: u16,
|
rng: &mut R,
|
||||||
dimension: usize,
|
dimension: usize,
|
||||||
|
quantizing: bool,
|
||||||
|
cancel: &(impl Fn() -> bool + Sync + Send),
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), arroy::Error> {
|
||||||
if !self.quantized {
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
if self.quantized {
|
||||||
writer.prepare_changing_distance::<BinaryQuantizedAngular>(wtxn)?;
|
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
self.quantized = true;
|
if writer.need_build(wtxn)? {
|
||||||
|
writer.builder(rng).build(wtxn)?
|
||||||
|
} else if writer.is_empty(wtxn)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||||
|
// If we are quantizing the databases, we can't know from meilisearch
|
||||||
|
// if the db was empty but still contained the wrong metadata, thus we need
|
||||||
|
// to quantize everything and can't stop early. Since this operation can
|
||||||
|
// only happen once in the life of an embedder, it's not very performance
|
||||||
|
// sensitive.
|
||||||
|
if quantizing && !self.quantized {
|
||||||
|
let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
|
||||||
|
writer.builder(rng).cancel(cancel).build(wtxn)?;
|
||||||
|
} else if writer.need_build(wtxn)? {
|
||||||
|
writer.builder(rng).cancel(cancel).build(wtxn)?;
|
||||||
|
} else if writer.is_empty(wtxn)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
|
/// Overwrite all the embeddings associated with the index and item ID.
|
||||||
if self.quantized {
|
/// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings.
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).need_build(rtxn)
|
/// You should call `del_items` on the `item_id` before calling this method.
|
||||||
} else {
|
/// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored.
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension).need_build(rtxn)
|
pub fn add_items(
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build<R: rand::Rng + rand::SeedableRng>(
|
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
rng: &mut R,
|
item_id: arroy::ItemId,
|
||||||
dimension: usize,
|
embeddings: &Embeddings<f32>,
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), arroy::Error> {
|
||||||
if self.quantized {
|
let dimension = embeddings.dimension();
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).build(wtxn, rng, None)
|
for (index, vector) in
|
||||||
} else {
|
arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension).build(wtxn, rng, None)
|
{
|
||||||
|
if self.quantized {
|
||||||
|
arroy::Writer::new(self.quantized_db(), index, dimension)
|
||||||
|
.add_item(wtxn, item_id, vector)?
|
||||||
|
} else {
|
||||||
|
arroy::Writer::new(self.angular_db(), index, dimension)
|
||||||
|
.add_item(wtxn, item_id, vector)?
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Add one embedding for this item in the first index where we can find an empty spot.
|
||||||
pub fn add_item(
|
pub fn add_item(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
dimension: usize,
|
|
||||||
item_id: arroy::ItemId,
|
item_id: arroy::ItemId,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), arroy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension)
|
self._add_item(wtxn, self.quantized_db(), item_id, vector)
|
||||||
.add_item(wtxn, item_id, vector)
|
|
||||||
} else {
|
} else {
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension)
|
self._add_item(wtxn, self.angular_db(), item_id, vector)
|
||||||
.add_item(wtxn, item_id, vector)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn del_item(
|
fn _add_item<D: arroy::Distance>(
|
||||||
|
&self,
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
db: arroy::Database<D>,
|
||||||
|
item_id: arroy::ItemId,
|
||||||
|
vector: &[f32],
|
||||||
|
) -> Result<(), arroy::Error> {
|
||||||
|
let dimension = vector.len();
|
||||||
|
|
||||||
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
|
let writer = arroy::Writer::new(db, index, dimension);
|
||||||
|
if !writer.contains_item(wtxn, item_id)? {
|
||||||
|
writer.add_item(wtxn, item_id, vector)?;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete all embeddings from a specific `item_id`
|
||||||
|
pub fn del_items(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
dimension: usize,
|
dimension: usize,
|
||||||
item_id: arroy::ItemId,
|
item_id: arroy::ItemId,
|
||||||
|
) -> Result<(), arroy::Error> {
|
||||||
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
|
if self.quantized {
|
||||||
|
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
|
if !writer.del_item(wtxn, item_id)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||||
|
if !writer.del_item(wtxn, item_id)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete one item.
|
||||||
|
pub fn del_item(
|
||||||
|
&self,
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
item_id: arroy::ItemId,
|
||||||
|
vector: &[f32],
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, arroy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).del_item(wtxn, item_id)
|
self._del_item(wtxn, self.quantized_db(), item_id, vector)
|
||||||
} else {
|
} else {
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension).del_item(wtxn, item_id)
|
self._del_item(wtxn, self.angular_db(), item_id, vector)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn _del_item<D: arroy::Distance>(
|
||||||
|
&self,
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
db: arroy::Database<D>,
|
||||||
|
item_id: arroy::ItemId,
|
||||||
|
vector: &[f32],
|
||||||
|
) -> Result<bool, arroy::Error> {
|
||||||
|
let dimension = vector.len();
|
||||||
|
let mut deleted_index = None;
|
||||||
|
|
||||||
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
|
let writer = arroy::Writer::new(db, index, dimension);
|
||||||
|
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
|
||||||
|
// uses invariant: vectors are packed in the first writers.
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
if candidate == vector {
|
||||||
|
writer.del_item(wtxn, item_id)?;
|
||||||
|
deleted_index = Some(index);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 🥲 enforce invariant: vectors are packed in the first writers.
|
||||||
|
if let Some(deleted_index) = deleted_index {
|
||||||
|
let mut last_index_with_a_vector = None;
|
||||||
|
for index in
|
||||||
|
arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize)
|
||||||
|
{
|
||||||
|
let writer = arroy::Writer::new(db, index, dimension);
|
||||||
|
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
last_index_with_a_vector = Some((index, candidate));
|
||||||
|
}
|
||||||
|
if let Some((last_index, vector)) = last_index_with_a_vector {
|
||||||
|
let writer = arroy::Writer::new(db, last_index, dimension);
|
||||||
|
writer.del_item(wtxn, item_id)?;
|
||||||
|
let writer = arroy::Writer::new(db, deleted_index, dimension);
|
||||||
|
writer.add_item(wtxn, item_id, &vector)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(deleted_index.is_some())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
|
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
|
||||||
if self.quantized {
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).clear(wtxn)
|
if self.quantized {
|
||||||
} else {
|
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension).clear(wtxn)
|
if writer.is_empty(wtxn)? {
|
||||||
}
|
break;
|
||||||
}
|
}
|
||||||
|
writer.clear(wtxn)?;
|
||||||
pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
|
} else {
|
||||||
if self.quantized {
|
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).is_empty(rtxn)
|
if writer.is_empty(wtxn)? {
|
||||||
} else {
|
break;
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension).is_empty(rtxn)
|
}
|
||||||
|
writer.clear(wtxn)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn contains_item(
|
pub fn contains_item(
|
||||||
@ -139,11 +277,25 @@ impl ArroyWrapper {
|
|||||||
dimension: usize,
|
dimension: usize,
|
||||||
item: arroy::ItemId,
|
item: arroy::ItemId,
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, arroy::Error> {
|
||||||
if self.quantized {
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).contains_item(rtxn, item)
|
let contains = if self.quantized {
|
||||||
} else {
|
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
arroy::Writer::new(self.angular_db(), self.index, dimension).contains_item(rtxn, item)
|
if writer.is_empty(rtxn)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writer.contains_item(rtxn, item)?
|
||||||
|
} else {
|
||||||
|
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||||
|
if writer.is_empty(rtxn)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writer.contains_item(rtxn, item)?
|
||||||
|
};
|
||||||
|
if contains {
|
||||||
|
return Ok(contains);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Ok(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn nns_by_item(
|
pub fn nns_by_item(
|
||||||
@ -152,45 +304,108 @@ impl ArroyWrapper {
|
|||||||
item: ItemId,
|
item: ItemId,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
filter: Option<&RoaringBitmap>,
|
filter: Option<&RoaringBitmap>,
|
||||||
) -> Result<Option<Vec<(ItemId, f32)>>, arroy::Error> {
|
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
arroy::Reader::open(rtxn, self.index, self.quantized_db())?
|
self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter)
|
||||||
.nns_by_item(rtxn, item, limit, None, None, filter)
|
|
||||||
} else {
|
} else {
|
||||||
arroy::Reader::open(rtxn, self.index, self.angular_db())?
|
self._nns_by_item(rtxn, self.angular_db(), item, limit, filter)
|
||||||
.nns_by_item(rtxn, item, limit, None, None, filter)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn _nns_by_item<D: arroy::Distance>(
|
||||||
|
&self,
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
db: arroy::Database<D>,
|
||||||
|
item: ItemId,
|
||||||
|
limit: usize,
|
||||||
|
filter: Option<&RoaringBitmap>,
|
||||||
|
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
|
||||||
|
for reader in self.readers(rtxn, db) {
|
||||||
|
let reader = reader?;
|
||||||
|
let mut searcher = reader.nns(limit);
|
||||||
|
if let Some(filter) = filter {
|
||||||
|
searcher.candidates(filter);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(mut ret) = searcher.by_item(rtxn, item)? {
|
||||||
|
results.append(&mut ret);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn nns_by_vector(
|
pub fn nns_by_vector(
|
||||||
&self,
|
&self,
|
||||||
txn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
item: &[f32],
|
vector: &[f32],
|
||||||
limit: usize,
|
limit: usize,
|
||||||
filter: Option<&RoaringBitmap>,
|
filter: Option<&RoaringBitmap>,
|
||||||
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
arroy::Reader::open(txn, self.index, self.quantized_db())?
|
self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter)
|
||||||
.nns_by_vector(txn, item, limit, None, None, filter)
|
|
||||||
} else {
|
} else {
|
||||||
arroy::Reader::open(txn, self.index, self.angular_db())?
|
self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter)
|
||||||
.nns_by_vector(txn, item, limit, None, None, filter)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn item_vector(&self, rtxn: &RoTxn, docid: u32) -> Result<Option<Vec<f32>>, arroy::Error> {
|
fn _nns_by_vector<D: arroy::Distance>(
|
||||||
|
&self,
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
db: arroy::Database<D>,
|
||||||
|
vector: &[f32],
|
||||||
|
limit: usize,
|
||||||
|
filter: Option<&RoaringBitmap>,
|
||||||
|
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
|
||||||
|
for reader in self.readers(rtxn, db) {
|
||||||
|
let reader = reader?;
|
||||||
|
let mut searcher = reader.nns(limit);
|
||||||
|
if let Some(filter) = filter {
|
||||||
|
searcher.candidates(filter);
|
||||||
|
}
|
||||||
|
|
||||||
|
results.append(&mut searcher.by_vector(rtxn, vector)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, arroy::Error> {
|
||||||
|
let mut vectors = Vec::new();
|
||||||
|
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
arroy::Reader::open(rtxn, self.index, self.quantized_db())?.item_vector(rtxn, docid)
|
for reader in self.readers(rtxn, self.quantized_db()) {
|
||||||
|
if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
|
||||||
|
vectors.push(vec);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
arroy::Reader::open(rtxn, self.index, self.angular_db())?.item_vector(rtxn, docid)
|
for reader in self.readers(rtxn, self.angular_db()) {
|
||||||
|
if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
|
||||||
|
vectors.push(vec);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Ok(vectors)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn angular_db(&self) -> arroy::Database<Angular> {
|
fn angular_db(&self) -> arroy::Database<Cosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedAngular> {
|
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -417,6 +417,8 @@ impl EmbeddingSettings {
|
|||||||
|
|
||||||
pub const DISTRIBUTION: &'static str = "distribution";
|
pub const DISTRIBUTION: &'static str = "distribution";
|
||||||
|
|
||||||
|
pub const BINARY_QUANTIZED: &'static str = "binaryQuantized";
|
||||||
|
|
||||||
pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] {
|
pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] {
|
||||||
match field {
|
match field {
|
||||||
Self::SOURCE => &[
|
Self::SOURCE => &[
|
||||||
@ -456,6 +458,13 @@ impl EmbeddingSettings {
|
|||||||
EmbedderSource::Rest,
|
EmbedderSource::Rest,
|
||||||
EmbedderSource::UserProvided,
|
EmbedderSource::UserProvided,
|
||||||
],
|
],
|
||||||
|
Self::BINARY_QUANTIZED => &[
|
||||||
|
EmbedderSource::HuggingFace,
|
||||||
|
EmbedderSource::Ollama,
|
||||||
|
EmbedderSource::OpenAi,
|
||||||
|
EmbedderSource::Rest,
|
||||||
|
EmbedderSource::UserProvided,
|
||||||
|
],
|
||||||
_other => unreachable!("unknown field"),
|
_other => unreachable!("unknown field"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -470,6 +479,7 @@ impl EmbeddingSettings {
|
|||||||
Self::DIMENSIONS,
|
Self::DIMENSIONS,
|
||||||
Self::DISTRIBUTION,
|
Self::DISTRIBUTION,
|
||||||
Self::URL,
|
Self::URL,
|
||||||
|
Self::BINARY_QUANTIZED,
|
||||||
],
|
],
|
||||||
EmbedderSource::HuggingFace => &[
|
EmbedderSource::HuggingFace => &[
|
||||||
Self::SOURCE,
|
Self::SOURCE,
|
||||||
@ -477,6 +487,7 @@ impl EmbeddingSettings {
|
|||||||
Self::REVISION,
|
Self::REVISION,
|
||||||
Self::DOCUMENT_TEMPLATE,
|
Self::DOCUMENT_TEMPLATE,
|
||||||
Self::DISTRIBUTION,
|
Self::DISTRIBUTION,
|
||||||
|
Self::BINARY_QUANTIZED,
|
||||||
],
|
],
|
||||||
EmbedderSource::Ollama => &[
|
EmbedderSource::Ollama => &[
|
||||||
Self::SOURCE,
|
Self::SOURCE,
|
||||||
@ -486,8 +497,11 @@ impl EmbeddingSettings {
|
|||||||
Self::API_KEY,
|
Self::API_KEY,
|
||||||
Self::DIMENSIONS,
|
Self::DIMENSIONS,
|
||||||
Self::DISTRIBUTION,
|
Self::DISTRIBUTION,
|
||||||
|
Self::BINARY_QUANTIZED,
|
||||||
],
|
],
|
||||||
EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION],
|
EmbedderSource::UserProvided => {
|
||||||
|
&[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION, Self::BINARY_QUANTIZED]
|
||||||
|
}
|
||||||
EmbedderSource::Rest => &[
|
EmbedderSource::Rest => &[
|
||||||
Self::SOURCE,
|
Self::SOURCE,
|
||||||
Self::API_KEY,
|
Self::API_KEY,
|
||||||
@ -498,6 +512,7 @@ impl EmbeddingSettings {
|
|||||||
Self::RESPONSE,
|
Self::RESPONSE,
|
||||||
Self::HEADERS,
|
Self::HEADERS,
|
||||||
Self::DISTRIBUTION,
|
Self::DISTRIBUTION,
|
||||||
|
Self::BINARY_QUANTIZED,
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -77,7 +77,8 @@
|
|||||||
"q": "puppy cute comforting movie",
|
"q": "puppy cute comforting movie",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 0.1
|
"semanticRatio": 0.1,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -91,7 +92,8 @@
|
|||||||
"q": "puppy cute comforting movie",
|
"q": "puppy cute comforting movie",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 0.5
|
"semanticRatio": 0.5,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -105,7 +107,8 @@
|
|||||||
"q": "puppy cute comforting movie",
|
"q": "puppy cute comforting movie",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 0.9
|
"semanticRatio": 0.9,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -119,7 +122,8 @@
|
|||||||
"q": "puppy cute comforting movie",
|
"q": "puppy cute comforting movie",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 1.0
|
"semanticRatio": 1.0,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -133,7 +137,8 @@
|
|||||||
"q": "shrek",
|
"q": "shrek",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 1.0
|
"semanticRatio": 1.0,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -147,7 +152,8 @@
|
|||||||
"q": "shrek",
|
"q": "shrek",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 0.5
|
"semanticRatio": 0.5,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -161,7 +167,8 @@
|
|||||||
"q": "shrek",
|
"q": "shrek",
|
||||||
"limit": 100,
|
"limit": 100,
|
||||||
"hybrid": {
|
"hybrid": {
|
||||||
"semanticRatio": 0.1
|
"semanticRatio": 0.1,
|
||||||
|
"embedder": "default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
Loading…
Reference in New Issue
Block a user