Compare commits

..

9 Commits

Author SHA1 Message Date
meili-bors[bot]
4e1ac9b0b4
Merge #5051
Some checks failed
Test suite / Tests on ${{ matrix.os }} (macos-12) (push) Waiting to run
Test suite / Tests almost all features (push) Has been skipped
Test suite / Test disabled tokenization (push) Has been skipped
Test suite / Tests on ubuntu-20.04 (push) Failing after 28s
Test suite / Run tests in debug (push) Failing after 31s
Test suite / Run Rustfmt (push) Successful in 2m6s
Test suite / Tests on ${{ matrix.os }} (windows-2022) (push) Failing after 8m36s
Test suite / Run Clippy (push) Failing after 21m21s
5051: Add timeout on read and write operations. r=irevoire a=dureuill

# Pull Request

## Related issue
Addresses #5054 

## What does this PR do?
- Add a timeout for read and write operations in the REST embedder. This might address some issues where tasks get "stuck" while embedding documents.
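
To illustrate the mechanism (a hedged sketch, not the PR's actual code): configuring read and write timeouts on a `ureq`-style HTTP agent looks roughly like this. The durations, endpoint, and payload below are hypothetical.

```rust
use std::time::Duration;

fn main() -> Result<(), ureq::Error> {
    // Hypothetical timeouts; the values used by the real embedder may differ.
    let agent = ureq::AgentBuilder::new()
        // Abort when the server stops sending response data for too long.
        .timeout_read(Duration::from_secs(30))
        // Abort when the request body cannot be written in time.
        .timeout_write(Duration::from_secs(30))
        .build();

    // Hypothetical embedder endpoint and payload.
    let response = agent
        .post("https://embedder.example.com/embed")
        .set("Content-Type", "application/json")
        .send_string(r#"{"input": "document text"}"#)?;
    println!("embedding request returned status {}", response.status());
    Ok(())
}
```

Unlike a single overall request timeout, read/write timeouts fire only when the connection stops making progress, so a slow but healthy embedding request is not cut off.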


Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-11-14 08:34:43 +00:00
meili-bors[bot]
8a18e37a3d
Merge #5055
5055: Update version for the next release (v1.11.2) in Cargo.toml r=dureuill a=meili-bot

⚠️ This PR is automatically generated. Check that the new version is the expected one and that Cargo.lock has been updated before merging.

Co-authored-by: dureuill <dureuill@users.noreply.github.com>
2024-11-14 07:53:51 +00:00
dureuill
36375ea326 Update version for the next release (v1.11.2) in Cargo.toml 2024-11-13 16:24:23 +00:00
Louis Dureuil
bca2974266
Add timeout on read and write operations. 2024-11-13 17:01:23 +01:00
meili-bors[bot]
13025594a8
Merge #5041
5041: Update version for the next release (v1.11.1) in Cargo.toml r=dureuill a=meili-bot

⚠️ This PR is automatically generated. Check that the new version is the expected one and that Cargo.lock has been updated before merging.

Co-authored-by: dureuill <dureuill@users.noreply.github.com>
2024-11-06 11:35:26 +00:00
meili-bors[bot]
2c1c33166d
Merge #5039
5039: Add 3s timeout to embedding requests made during search r=irevoire a=dureuill

# Pull Request

## Related issue
Fixes #5032 

## What does this PR do?
- Add a 3-second timeout to embedding requests made against a remote embedder in the context of search. The timeout typically comes into play when requests fail or stall due to rate-limiting.
- Add a test of that timeout.
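
A deadline differs from a fixed per-request timeout in that the remaining budget shrinks as the search progresses. A minimal sketch of that bookkeeping (illustrative only, not the PR's actual code):

```rust
use std::time::{Duration, Instant};

/// Converts an absolute deadline into a timeout for the next request.
/// Returns `None` once the deadline has already passed.
fn remaining_budget(deadline: Instant) -> Option<Duration> {
    let now = Instant::now();
    (now < deadline).then(|| deadline - now)
}

fn main() {
    // A 3-second deadline, mirroring the value described in this PR.
    let deadline = Instant::now() + Duration::from_secs(3);

    match remaining_budget(deadline) {
        // A real implementation would hand `budget` to the HTTP client.
        Some(budget) => println!("send embedding request with timeout {budget:?}"),
        // Hypothetical fallback: answer without the semantic part rather than hang.
        None => println!("deadline exceeded, skipping the embedding request"),
    }
}
```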

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-11-06 10:56:50 +00:00
dureuill
cdb6e3f45a Update version for the next release (v1.11.1) in Cargo.toml 2024-11-06 08:35:51 +00:00
Louis Dureuil
1d574bd443
Add test 2024-11-06 09:25:41 +01:00
Louis Dureuil
37a4fd7f99
Add deadline of 3 seconds to embedding requests made in the context of hybrid search 2024-11-06 09:25:24 +01:00
1088 changed files with 1459 additions and 1947 deletions

@@ -43,7 +43,7 @@ jobs:
       # Run benchmarks
       - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
         run: |
-          cd crates/benchmarks
+          cd benchmarks
           cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
       # Generate critcmp files

@@ -88,7 +88,7 @@ jobs:
       # Run benchmarks
       - name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
         run: |
-          cd crates/benchmarks
+          cd benchmarks
           cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
       # Generate critcmp files

@@ -41,7 +41,7 @@ jobs:
       # Run benchmarks
       - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
         run: |
-          cd crates/benchmarks
+          cd benchmarks
           cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
       # Generate critcmp files

@@ -40,7 +40,7 @@ jobs:
       # Run benchmarks
       - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
         run: |
-          cd crates/benchmarks
+          cd benchmarks
           cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
       # Generate critcmp files

@@ -40,7 +40,7 @@ jobs:
       # Run benchmarks
       - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
         run: |
-          cd crates/benchmarks
+          cd benchmarks
           cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
       # Generate critcmp files

@@ -40,7 +40,7 @@ jobs:
       # Run benchmarks
       - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
         run: |
-          cd crates/benchmarks
+          cd benchmarks
           cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
       # Generate critcmp files

@@ -21,10 +21,10 @@ jobs:
       - name: Install cargo-flaky
         run: cargo install cargo-flaky
       - name: Run cargo flaky in the dumps
-        run: cd crates/dump; cargo flaky -i 100 --release
+        run: cd dump; cargo flaky -i 100 --release
       - name: Run cargo flaky in the index-scheduler
-        run: cd crates/index-scheduler; cargo flaky -i 100 --release
+        run: cd index-scheduler; cargo flaky -i 100 --release
       - name: Run cargo flaky in the auth
-        run: cd crates/meilisearch-auth; cargo flaky -i 100 --release
+        run: cd meilisearch-auth; cargo flaky -i 100 --release
       - name: Run cargo flaky in meilisearch
-        run: cd crates/meilisearch; cargo flaky -i 100 --release
+        run: cd meilisearch; cargo flaky -i 100 --release

@@ -65,9 +65,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [macos-13, windows-2022]
+        os: [macos-12, windows-2022]
         include:
-          - os: macos-13
+          - os: macos-12
             artifact_name: meilisearch
             asset_name: meilisearch-macos-amd64
           - os: windows-2022
@@ -90,7 +90,7 @@ jobs:
   publish-macos-apple-silicon:
     name: Publish binary for macOS silicon
-    runs-on: macos-13
+    runs-on: macos-12
     needs: check-version
     strategy:
       matrix:

@@ -33,7 +33,7 @@ jobs:
       - name: Setup test with Rust stable
         uses: dtolnay/rust-toolchain@1.79
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.7.5
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo check without any default features
         uses: actions-rs/cargo@v1
         with:
@@ -51,11 +51,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [macos-13, windows-2022]
+        os: [macos-12, windows-2022]
     steps:
       - uses: actions/checkout@v3
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.7.5
+        uses: Swatinem/rust-cache@v2.7.1
       - uses: dtolnay/rust-toolchain@1.79
       - name: Run cargo check without any default features
         uses: actions-rs/cargo@v1
@@ -127,7 +127,7 @@ jobs:
         apt-get install build-essential -y
       - uses: dtolnay/rust-toolchain@1.79
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.7.5
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run tests in debug
         uses: actions-rs/cargo@v1
         with:
@@ -144,7 +144,7 @@ jobs:
           profile: minimal
           components: clippy
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.7.5
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo clippy
         uses: actions-rs/cargo@v1
         with:
@@ -163,11 +163,11 @@ jobs:
           override: true
           components: rustfmt
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.7.5
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo fmt
         # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
         # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
         # we are going to create an empty file where rustfmt expects it.
         run: |
-          echo -ne "\n" > crates/benchmarks/benches/datasets_paths.rs
+          echo -ne "\n" > benchmarks/benches/datasets_paths.rs
           cargo fmt --all -- --check

.gitignore (vendored): 3 changed lines

@@ -5,6 +5,7 @@
 **/*.json_lines
 **/*.rs.bk
 /*.mdb
+/query-history.txt
 /data.ms
 /snapshots
 /dumps
@@ -18,4 +19,4 @@
 *.snap.new
 
 # Fuzzcheck data for the facet indexing fuzz test
-crates/milli/fuzz/update::facet::incremental::fuzz::fuzz/
+milli/fuzz/update::facet::incremental::fuzz::fuzz/

Cargo.lock (generated): 62 changed lines

@@ -404,25 +404,6 @@ dependencies = [
  "thiserror",
 ]
 
-[[package]]
-name = "arroy"
-version = "0.5.0"
-source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7"
-dependencies = [
- "bytemuck",
- "byteorder",
- "heed",
- "log",
- "memmap2",
- "nohash",
- "ordered-float",
- "rand",
- "rayon",
- "roaring",
- "tempfile",
- "thiserror",
-]
-
 [[package]]
 name = "assert-json-diff"
 version = "2.0.2"
@@ -491,7 +472,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
 [[package]]
 name = "benchmarks"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "anyhow",
  "bytes",
@@ -672,7 +653,7 @@ dependencies = [
 
 [[package]]
 name = "build-info"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "anyhow",
  "time",
@@ -726,9 +707,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 
 [[package]]
 name = "bytemuck"
-version = "1.19.0"
+version = "1.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
+checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
 dependencies = [
  "bytemuck_derive",
 ]
@@ -1642,7 +1623,7 @@ dependencies = [
 
 [[package]]
 name = "dump"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "anyhow",
  "big_s",
@@ -1854,7 +1835,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
 
 [[package]]
 name = "file-store"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "tempfile",
  "thiserror",
@@ -1876,7 +1857,7 @@ dependencies = [
 
 [[package]]
 name = "filter-parser"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "insta",
  "nom",
@@ -1896,7 +1877,7 @@ dependencies = [
 
 [[package]]
 name = "flatten-serde-json"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "criterion",
  "serde_json",
@@ -2020,7 +2001,7 @@ dependencies = [
 
 [[package]]
 name = "fuzzers"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "arbitrary",
  "clap",
@@ -2572,10 +2553,10 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
 
 [[package]]
 name = "index-scheduler"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "anyhow",
- "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "arroy",
  "big_s",
  "bincode",
  "crossbeam",
@@ -2766,7 +2747,7 @@ dependencies = [
 
 [[package]]
 name = "json-depth-checker"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "criterion",
  "serde_json",
@@ -3385,7 +3366,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "meili-snap"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "insta",
  "md5",
@@ -3394,7 +3375,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "actix-cors",
  "actix-http",
@@ -3484,7 +3465,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-auth"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "base64 0.22.1",
  "enum-iterator",
@@ -3503,7 +3484,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-types"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "actix-web",
  "anyhow",
@@ -3533,10 +3514,9 @@ dependencies = [
 
 [[package]]
 name = "meilitool"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "anyhow",
- "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)",
  "clap",
  "dump",
  "file-store",
@@ -3565,9 +3545,9 @@ dependencies = [
 
 [[package]]
 name = "milli"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
- "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "arroy",
  "big_s",
  "bimap",
  "bincode",
@@ -4011,7 +3991,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 
 [[package]]
 name = "permissive-json-pointer"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "big_s",
  "serde_json",
@@ -6400,7 +6380,7 @@ dependencies = [
 
 [[package]]
 name = "xtask"
-version = "1.11.0"
+version = "1.11.2"
 dependencies = [
  "anyhow",
  "build-info",

@@ -1,28 +1,28 @@
 [workspace]
 resolver = "2"
 members = [
-    "crates/meilisearch",
-    "crates/meilitool",
-    "crates/meilisearch-types",
-    "crates/meilisearch-auth",
-    "crates/meili-snap",
-    "crates/index-scheduler",
-    "crates/dump",
-    "crates/file-store",
-    "crates/permissive-json-pointer",
-    "crates/milli",
-    "crates/filter-parser",
-    "crates/flatten-serde-json",
-    "crates/json-depth-checker",
-    "crates/benchmarks",
-    "crates/fuzzers",
-    "crates/tracing-trace",
-    "crates/xtask",
-    "crates/build-info",
+    "meilisearch",
+    "meilitool",
+    "meilisearch-types",
+    "meilisearch-auth",
+    "meili-snap",
+    "index-scheduler",
+    "dump",
+    "file-store",
+    "permissive-json-pointer",
+    "milli",
+    "filter-parser",
+    "flatten-serde-json",
+    "json-depth-checker",
+    "benchmarks",
+    "fuzzers",
+    "tracing-trace",
+    "xtask",
+    "build-info",
 ]
 
 [workspace.package]
-version = "1.11.0"
+version = "1.11.2"
 authors = [
   "Quentin de Quelen <quentin@dequelen.me>",
   "Clément Renault <clement@meilisearch.com>",

@@ -21,7 +21,6 @@ RUN set -eux; \
 # Run
 FROM alpine:3.20
 
-LABEL org.opencontainers.image.source="https://github.com/meilisearch/meilisearch"
 
 ENV MEILI_HTTP_ADDR 0.0.0.0:7700
 ENV MEILI_SERVER_PROVIDER docker

@@ -1,9 +1,6 @@
 <p align="center">
-  <a href="https://www.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=logo#gh-light-mode-only" target="_blank">
-    <img src="assets/meilisearch-logo-light.svg?sanitize=true#gh-light-mode-only">
-  </a>
-  <a href="https://www.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=logo#gh-dark-mode-only" target="_blank">
-    <img src="assets/meilisearch-logo-dark.svg?sanitize=true#gh-dark-mode-only">
+  <a href="https://www.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=logo" target="_blank">
+    <img src="assets/meilisearch-logo-kawaii.png">
   </a>
 </p>

(Binary image file added; contents not shown. Size: 98 KiB.)

@@ -1,6 +1,6 @@
 status = [
   'Tests on ubuntu-20.04',
-  'Tests on macos-13',
+  'Tests on macos-12',
   'Tests on windows-2022',
   'Run Clippy',
   'Run Rustfmt',

@@ -1,109 +0,0 @@
use std::any::Any;
use std::sync::Arc;
use actix_web::HttpRequest;
use meilisearch_types::InstanceUid;
use serde_json::Value;
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery};
use crate::Opt;
pub struct MockAnalytics {
instance_uid: Option<InstanceUid>,
}
#[derive(Default)]
pub struct SearchAggregator;
#[allow(dead_code)]
impl SearchAggregator {
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
Self
}
pub fn succeed(&mut self, _: &dyn Any) {}
}
#[derive(Default)]
pub struct SimilarAggregator;
#[allow(dead_code)]
impl SimilarAggregator {
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
Self
}
pub fn succeed(&mut self, _: &dyn Any) {}
}
#[derive(Default)]
pub struct MultiSearchAggregator;
#[allow(dead_code)]
impl MultiSearchAggregator {
pub fn from_federated_search(_: &dyn Any, _: &dyn Any) -> Self {
Self
}
pub fn succeed(&mut self) {}
}
#[derive(Default)]
pub struct FacetSearchAggregator;
#[allow(dead_code)]
impl FacetSearchAggregator {
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
Self
}
pub fn succeed(&mut self, _: &dyn Any) {}
}
impl MockAnalytics {
#[allow(clippy::new_ret_no_self)]
pub fn new(opt: &Opt) -> Arc<dyn Analytics> {
let instance_uid = find_user_id(&opt.db_path);
Arc::new(Self { instance_uid })
}
}
impl Analytics for MockAnalytics {
fn instance_uid(&self) -> Option<&meilisearch_types::InstanceUid> {
self.instance_uid.as_ref()
}
// These methods are noop and should be optimized out
fn publish(&self, _event_name: String, _send: Value, _request: Option<&HttpRequest>) {}
fn get_search(&self, _aggregate: super::SearchAggregator) {}
fn post_search(&self, _aggregate: super::SearchAggregator) {}
fn get_similar(&self, _aggregate: super::SimilarAggregator) {}
fn post_similar(&self, _aggregate: super::SimilarAggregator) {}
fn post_multi_search(&self, _aggregate: super::MultiSearchAggregator) {}
fn post_facet_search(&self, _aggregate: super::FacetSearchAggregator) {}
fn add_documents(
&self,
_documents_query: &UpdateDocumentsQuery,
_index_creation: bool,
_request: &HttpRequest,
) {
}
fn delete_documents(&self, _kind: DocumentDeletionKind, _request: &HttpRequest) {}
fn update_documents(
&self,
_documents_query: &UpdateDocumentsQuery,
_index_creation: bool,
_request: &HttpRequest,
) {
}
fn update_documents_by_function(
&self,
_documents_query: &DocumentEditionByFunction,
_index_creation: bool,
_request: &HttpRequest,
) {
}
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
}

@@ -1,334 +0,0 @@
use std::fs::{read_dir, read_to_string, remove_file, File};
use std::io::BufWriter;
use std::path::PathBuf;
use anyhow::Context;
use clap::{Parser, Subcommand};
use dump::{DumpWriter, IndexMetadata};
use file_store::FileStore;
use meilisearch_auth::AuthController;
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::{obkv_to_json, BEU32};
use meilisearch_types::tasks::{Status, Task};
use meilisearch_types::versioning::{get_version, parse_version};
use meilisearch_types::Index;
use time::macros::format_description;
use time::OffsetDateTime;
use upgrade::OfflineUpgrade;
use uuid_codec::UuidCodec;
mod upgrade;
mod uuid_codec;
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
/// The database path where Meilisearch is running.
#[arg(long, default_value = "data.ms/")]
db_path: PathBuf,
#[command(subcommand)]
command: Command,
}
#[derive(Subcommand)]
enum Command {
/// Clears the task queue and makes it empty.
///
/// This command can be safely executed even if Meilisearch is running and processing tasks.
/// Once the task queue is empty you can restart Meilisearch and no tasks should be visible anymore,
/// even the ones that were processing. However, it is quite possible that the previously
/// processing tasks will show up in the queue again, with an associated internal error message.
ClearTaskQueue,
/// Exports a dump from the Meilisearch database.
///
/// Make sure to run this command when Meilisearch is not running, or running but not processing tasks.
/// If tasks are being processed while a dump is being exported, the dump may end up
/// malformed, with missing tasks.
///
/// TODO Verify this claim or make sure it cannot happen and we can export dumps
/// without caring about killing Meilisearch first!
ExportADump {
/// The directory in which the dump will be created.
#[arg(long, default_value = "dumps/")]
dump_dir: PathBuf,
/// Skip dumping the enqueued or processing tasks.
///
/// Can be useful when there are a lot of them and it is not particularly useful
/// to keep them. Note that only the enqueued tasks take up space, so skipping
/// the processed ones is not particularly interesting.
#[arg(long)]
skip_enqueued_tasks: bool,
},
/// Attempts to upgrade from one major version to the next without a dump.
///
/// Make sure to run this command when Meilisearch is not running!
/// If Meilisearch is running while executing this command, the database could be corrupted
/// (contain data from both the old and the new versions)
///
/// Supported upgrade paths:
///
/// - v1.9.x -> v1.10.x -> v1.11.x
OfflineUpgrade {
#[arg(long)]
target_version: String,
},
}
fn main() -> anyhow::Result<()> {
let Cli { db_path, command } = Cli::parse();
let detected_version = get_version(&db_path).context("While checking the version file")?;
match command {
Command::ClearTaskQueue => clear_task_queue(db_path),
Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
export_a_dump(db_path, dump_dir, skip_enqueued_tasks)
}
Command::OfflineUpgrade { target_version } => {
let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?;
OfflineUpgrade { db_path, current_version: detected_version, target_version }.upgrade()
}
}
}
/// Clears the task queue located at `db_path`.
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
let path = db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) }
.with_context(|| format!("While trying to open {:?}", path.display()))?;
eprintln!("Deleting tasks from the database...");
let mut wtxn = env.write_txn()?;
let all_tasks = try_opening_poly_database(&env, &wtxn, "all-tasks")?;
let total = all_tasks.len(&wtxn)?;
let status = try_opening_poly_database(&env, &wtxn, "status")?;
let kind = try_opening_poly_database(&env, &wtxn, "kind")?;
let index_tasks = try_opening_poly_database(&env, &wtxn, "index-tasks")?;
let canceled_by = try_opening_poly_database(&env, &wtxn, "canceled_by")?;
let enqueued_at = try_opening_poly_database(&env, &wtxn, "enqueued-at")?;
let started_at = try_opening_poly_database(&env, &wtxn, "started-at")?;
let finished_at = try_opening_poly_database(&env, &wtxn, "finished-at")?;
try_clearing_poly_database(&mut wtxn, all_tasks, "all-tasks")?;
try_clearing_poly_database(&mut wtxn, status, "status")?;
try_clearing_poly_database(&mut wtxn, kind, "kind")?;
try_clearing_poly_database(&mut wtxn, index_tasks, "index-tasks")?;
try_clearing_poly_database(&mut wtxn, canceled_by, "canceled_by")?;
try_clearing_poly_database(&mut wtxn, enqueued_at, "enqueued-at")?;
try_clearing_poly_database(&mut wtxn, started_at, "started-at")?;
try_clearing_poly_database(&mut wtxn, finished_at, "finished-at")?;
wtxn.commit().context("While committing the transaction")?;
eprintln!("Successfully deleted {total} tasks from the tasks database!");
eprintln!("Deleting the content files from disk...");
let mut count = 0usize;
let update_files = db_path.join("update_files");
let entries = read_dir(&update_files).with_context(|| {
format!("While trying to read the content of {:?}", update_files.display())
})?;
for result in entries {
match result {
Ok(ent) => match remove_file(ent.path()) {
Ok(_) => count += 1,
Err(e) => eprintln!("Error while deleting {:?}: {}", ent.path().display(), e),
},
Err(e) => {
eprintln!("Error while reading a file in {:?}: {}", update_files.display(), e)
}
}
}
eprintln!("Successfully deleted {count} content files from disk!");
Ok(())
}
fn try_opening_database<KC: 'static, DC: 'static>(
env: &Env,
rtxn: &RoTxn,
db_name: &str,
) -> anyhow::Result<Database<KC, DC>> {
env.open_database(rtxn, Some(db_name))
.with_context(|| format!("While opening the {db_name:?} database"))?
.with_context(|| format!("Missing the {db_name:?} database"))
}
fn try_opening_poly_database(
env: &Env,
rtxn: &RoTxn,
db_name: &str,
) -> anyhow::Result<Database<Unspecified, Unspecified>> {
env.database_options()
.name(db_name)
.open(rtxn)
.with_context(|| format!("While opening the {db_name:?} poly database"))?
.with_context(|| format!("Missing the {db_name:?} poly database"))
}
fn try_clearing_poly_database(
wtxn: &mut RwTxn,
database: Database<Unspecified, Unspecified>,
db_name: &str,
) -> anyhow::Result<()> {
database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database"))
}
/// Exports a dump into the dump directory.
fn export_a_dump(
db_path: PathBuf,
dump_dir: PathBuf,
skip_enqueued_tasks: bool,
) -> Result<(), anyhow::Error> {
let started_at = OffsetDateTime::now_utc();
// 1. Extracts the instance UID from disk
let instance_uid_path = db_path.join("instance-uid");
let instance_uid = match read_to_string(&instance_uid_path) {
Ok(content) => match content.trim().parse() {
Ok(uuid) => Some(uuid),
Err(e) => {
eprintln!("Impossible to parse instance-uid: {e}");
None
}
},
Err(e) => {
eprintln!("Impossible to read {}: {}", instance_uid_path.display(), e);
None
}
};
let dump = DumpWriter::new(instance_uid).context("While creating a new dump")?;
let file_store =
FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?;
let index_scheduler_path = db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
eprintln!("Dumping the keys...");
// 2. dump the keys
let auth_store = AuthController::new(&db_path, &None)
.with_context(|| format!("While opening the auth store at {}", db_path.display()))?;
let mut dump_keys = dump.create_keys()?;
let mut count = 0;
for key in auth_store.list_keys()? {
dump_keys.push_key(&key)?;
count += 1;
}
dump_keys.flush()?;
eprintln!("Successfully dumped {count} keys!");
let rtxn = env.read_txn()?;
let all_tasks: Database<BEU32, SerdeJson<Task>> =
try_opening_database(&env, &rtxn, "all-tasks")?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &rtxn, "index-mapping")?;
if skip_enqueued_tasks {
eprintln!("Skip dumping the enqueued tasks...");
} else {
eprintln!("Dumping the enqueued tasks...");
// 3. dump the tasks
let mut dump_tasks = dump.create_tasks_queue()?;
let mut count = 0;
for ret in all_tasks.iter(&rtxn)? {
let (_, t) = ret?;
let status = t.status;
let content_file = t.content_uuid();
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
if let Some(content_file_uuid) = content_file {
if status == Status::Enqueued {
let content_file = file_store.get_update(content_file_uuid)?;
let reader =
DocumentsBatchReader::from_reader(content_file).with_context(|| {
format!("While reading content file {:?}", content_file_uuid)
})?;
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
while let Some(doc) = cursor.next_document().with_context(|| {
format!("While iterating on content file {:?}", content_file_uuid)
})? {
dump_content_file
.push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
}
dump_content_file.flush()?;
count += 1;
}
}
}
dump_tasks.flush()?;
eprintln!("Successfully dumped {count} enqueued tasks!");
}
eprintln!("Dumping the indexes...");
// 4. Dump the indexes
let mut count = 0;
for result in index_mapping.iter(&rtxn)? {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
let index = Index::new(EnvOpenOptions::new(), &index_path).with_context(|| {
format!("While trying to open the index at path {:?}", index_path.display())
})?;
let rtxn = index.read_txn()?;
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
created_at: index.created_at(&rtxn)?,
updated_at: index.updated_at(&rtxn)?,
};
let mut index_dumper = dump.create_index(uid, &metadata)?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
// 4.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
let (_id, doc) = ret?;
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
}
// 4.2. Dump the settings
let settings = meilisearch_types::settings::settings(
&index,
&rtxn,
meilisearch_types::settings::SecretPolicy::RevealSecrets,
)?;
index_dumper.settings(&settings)?;
count += 1;
}
eprintln!("Successfully dumped {count} indexes!");
// We will not dump experimental feature settings
eprintln!("The tool is not dumping experimental features, please set them by hand afterward");
let dump_uid = started_at.format(format_description!(
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
)).unwrap();
let path = dump_dir.join(format!("{}.dump", dump_uid));
let file = File::create(&path)?;
dump.persist_to(BufWriter::new(file))?;
eprintln!("Dump exported at path {:?}", path.display());
Ok(())
}

@@ -1,73 +0,0 @@
mod v1_10;
mod v1_11;
mod v1_9;
use std::path::{Path, PathBuf};
use anyhow::{bail, Context};
use meilisearch_types::versioning::create_version_file;
use v1_10::v1_9_to_v1_10;
use crate::upgrade::v1_11::v1_10_to_v1_11;
pub struct OfflineUpgrade {
pub db_path: PathBuf,
pub current_version: (String, String, String),
pub target_version: (String, String, String),
}
impl OfflineUpgrade {
pub fn upgrade(self) -> anyhow::Result<()> {
let upgrade_list = [
(v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"),
(v1_10_to_v1_11, "1", "11", "0"),
];
let (current_major, current_minor, current_patch) = &self.current_version;
let start_at = match (
current_major.as_str(),
current_minor.as_str(),
current_patch.as_str(),
) {
("1", "9", _) => 0,
("1", "10", _) => 1,
_ => {
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10")
}
};
let (target_major, target_minor, target_patch) = &self.target_version;
let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
("1", "10", _) => 0,
("1", "11", _) => 1,
(major, _, _) if major.starts_with('v') => {
bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.")
}
_ => {
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11")
}
};
println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
#[allow(clippy::needless_range_loop)]
for index in start_at..=ends_at {
let (func, major, minor, patch) = upgrade_list[index];
(func)(&self.db_path)?;
println!("Done");
// We're writing the version file just in case an issue arises _while_ upgrading.
// We don't want the DB to fail in an unknown state.
println!("Writing VERSION file");
create_version_file(&self.db_path, major, minor, patch)
.context("while writing VERSION file after the upgrade")?;
}
println!("Success");
Ok(())
}
}

@@ -1,289 +0,0 @@
use anyhow::bail;
use std::path::Path;
use anyhow::Context;
use meilisearch_types::{
heed::{
types::{SerdeJson, Str},
Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
},
milli::index::{db_name, main_key},
};
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
use super::v1_9;
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
/// The statistics that can be computed from an `Index` object.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
#[serde(with = "time::serde::rfc3339")]
pub created_at: time::OffsetDateTime,
/// Date of the last update of the index.
#[serde(with = "time::serde::rfc3339")]
pub updated_at: time::OffsetDateTime,
}
impl From<v1_9::IndexStats> for IndexStats {
fn from(
v1_9::IndexStats {
number_of_documents,
database_size,
used_database_size,
field_distribution,
created_at,
updated_at,
}: v1_9::IndexStats,
) -> Self {
IndexStats {
number_of_documents,
database_size,
used_database_size,
field_distribution,
created_at: created_at.0,
updated_at: updated_at.0,
}
}
}
#[derive(serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
fn update_index_stats(
index_stats: Database<UuidCodec, Unspecified>,
index_uid: &str,
index_uuid: uuid::Uuid,
sched_wtxn: &mut RwTxn,
) -> anyhow::Result<()> {
let ctx = || format!("while updating index stats for index `{index_uid}`");
let stats: Option<&str> = index_stats
.remap_data_type::<Str>()
.get(sched_wtxn, &index_uuid)
.with_context(ctx)
.with_context(|| "While reading value")?;
dbg!(stats);
let stats: Option<v1_9::IndexStats> = index_stats
.remap_data_type::<SerdeJson<v1_9::IndexStats>>()
.get(sched_wtxn, &index_uuid)
.with_context(ctx)
.with_context(|| "While reading value")?;
if let Some(stats) = stats {
let stats: self::IndexStats = stats.into();
index_stats
.remap_data_type::<SerdeJson<self::IndexStats>>()
.put(sched_wtxn, &index_uuid, &stats)
.with_context(ctx)
.with_context(|| "While writing value")?;
}
Ok(())
}
fn update_date_format(
index_uid: &str,
index_env: &Env,
index_wtxn: &mut RwTxn,
) -> anyhow::Result<()> {
let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
.with_context(|| format!("while updating date format for index `{index_uid}`"))?;
date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
Ok(())
}
fn find_rest_embedders(
index_uid: &str,
index_env: &Env,
index_txn: &RoTxn,
) -> anyhow::Result<Vec<String>> {
let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
.with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
let mut rest_embedders = vec![];
for config in main
.remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
.get(index_txn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()
{
if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
rest_embedders.push(config.name);
}
}
Ok(rest_embedders)
}
fn date_round_trip(
wtxn: &mut RwTxn,
index_uid: &str,
db: Database<Unspecified, Unspecified>,
key: &str,
) -> anyhow::Result<()> {
let datetime =
db.remap_types::<Str, SerdeJson<v1_9::LegacyDateTime>>().get(wtxn, key).with_context(
|| format!("could not read `{key}` while updating date format for index `{index_uid}`"),
)?;
if let Some(datetime) = datetime {
db.remap_types::<Str, SerdeJson<self::OffsetDateTime>>()
.put(wtxn, key, &self::OffsetDateTime(datetime.0))
.with_context(|| {
format!(
"could not write `{key}` while updating date format for index `{index_uid}`"
)
})?;
}
Ok(())
}
pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> {
println!("Upgrading from v1.9.0 to v1.10.0");
// 2 changes here
// 1. date format. needs to be done before opening the Index
// 2. REST embedders. We don't support this case right now, so bail
let index_scheduler_path = db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
let mut sched_wtxn = env.write_txn()?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &sched_wtxn, "index-mapping")?;
let index_stats: Database<UuidCodec, Unspecified> =
try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
format!("While trying to open {:?}", index_scheduler_path.display())
})?;
let index_count =
index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
// FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
// 1. immutably for the iteration
// 2. mutably for updating index stats
let indexes: Vec<_> = index_mapping
.iter(&sched_wtxn)?
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
.collect();
let mut rest_embedders = Vec::new();
let mut unwrapped_indexes = Vec::new();
// check that update can take place
for (index_index, result) in indexes.into_iter().enumerate() {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
// FIXME: fetch the 25 magic number from the index file
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let index_txn = index_env.read_txn().with_context(|| {
format!(
"while obtaining a write transaction for index {uid} at {}",
index_path.display()
)
})?;
println!("\t- Checking for incompatible embedders (REST embedders)");
let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
if rest_embedders_for_index.is_empty() {
unwrapped_indexes.push((uid, uuid));
} else {
// no need to add to unwrapped indexes because we'll exit early
rest_embedders.push((uid, rest_embedders_for_index));
}
}
if !rest_embedders.is_empty() {
let rest_embedders = rest_embedders
.into_iter()
.flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
.map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
.collect::<Vec<_>>()
.join("\n");
bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
The database has not been modified and is still a valid v1.9 database.");
}
println!("Update can take place, updating");
for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
let index_path = db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Updating index `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
// FIXME: fetch the 25 magic number from the index file
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let mut index_wtxn = index_env.write_txn().with_context(|| {
format!(
"while obtaining a write transaction for index `{uid}` at `{}`",
index_path.display()
)
})?;
println!("\t- Updating index stats");
update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
println!("\t- Updating date format");
update_date_format(&uid, &index_env, &mut index_wtxn)?;
index_wtxn.commit().with_context(|| {
format!("while committing the write txn for index `{uid}` at {}", index_path.display())
})?;
}
sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
println!("Upgrading database succeeded");
Ok(())
}

@@ -1,85 +0,0 @@
//! The breaking changes that happened between the v1.10 and the v1.11 are:
//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0
//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata.
//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB.
//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything.
use std::path::Path;
use anyhow::Context;
use meilisearch_types::{
heed::{types::Str, Database, EnvOpenOptions},
milli::index::db_name,
};
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> {
println!("Upgrading from v1.10.0 to v1.11.0");
let index_scheduler_path = db_path.join("tasks");
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
let sched_rtxn = env.read_txn()?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &sched_rtxn, "index-mapping")?;
let index_count =
index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?;
let indexes: Vec<_> = index_mapping
.iter(&sched_rtxn)?
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
.collect();
for (index_index, result) in indexes.into_iter().enumerate() {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
println!(
"[{}/{index_count}]Updating embeddings for `{uid}` at `{}`",
index_index + 1,
index_path.display()
);
let index_env = unsafe {
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
format!("while opening index {uid} at '{}'", index_path.display())
})?
};
let index_rtxn = index_env.read_txn().with_context(|| {
format!(
"while obtaining a read transaction for index {uid} at {}",
index_path.display()
)
})?;
let index_read_database =
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
.with_context(|| format!("while updating date format for index `{uid}`"))?;
let mut index_wtxn = index_env.write_txn().with_context(|| {
format!(
"while obtaining a write transaction for index {uid} at {}",
index_path.display()
)
})?;
let index_write_database =
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
.with_context(|| format!("while updating date format for index `{uid}`"))?;
arroy_v04_to_v05::ugrade_from_prev_version(
&index_rtxn,
index_read_database,
&mut index_wtxn,
index_write_database,
)?;
index_wtxn.commit()?;
}
Ok(())
}

@@ -1,158 +0,0 @@
use serde::{Deserialize, Serialize};
use time::{Date, OffsetDateTime, Time, UtcOffset};
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
/// The statistics that can be computed from an `Index` object.
#[derive(serde::Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
pub created_at: LegacyDateTime,
/// Date of the last update of the index.
pub updated_at: LegacyDateTime,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
pub name: String,
pub config: EmbeddingConfig,
}
#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
pub struct EmbeddingConfig {
/// Options of the embedder, specific to each kind of embedder
pub embedder_options: EmbedderOptions,
}
/// Options of an embedder, specific to each kind of embedder.
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum EmbedderOptions {
HuggingFace(hf::EmbedderOptions),
OpenAi(openai::EmbedderOptions),
Ollama(ollama::EmbedderOptions),
UserProvided(manual::EmbedderOptions),
Rest(rest::EmbedderOptions),
}
impl Default for EmbedderOptions {
fn default() -> Self {
Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
}
}
mod hf {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub model: String,
pub revision: Option<String>,
}
}
mod openai {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub api_key: Option<String>,
pub dimensions: Option<usize>,
}
}
mod ollama {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub embedding_model: String,
pub url: Option<String>,
pub api_key: Option<String>,
}
}
mod manual {
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
pub dimensions: usize,
}
}
mod rest {
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
pub struct EmbedderOptions {
pub api_key: Option<String>,
pub dimensions: Option<usize>,
pub url: String,
pub input_field: Vec<String>,
// path to the array of embeddings
pub path_to_embeddings: Vec<String>,
// shape of a single embedding
pub embedding_object: Vec<String>,
}
}
/// A datetime from Meilisearch v1.9 with an unspecified format.
#[derive(Debug)]
pub struct LegacyDateTime(pub OffsetDateTime);
impl<'de> Deserialize<'de> for LegacyDateTime {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct Visitor;
impl<'de> serde::de::Visitor<'de> for Visitor {
type Value = OffsetDateTime;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(formatter, "a valid datetime")
}
// Comes from a binary. The legacy format is:
// 2024-11-04 13:32:08.48368 +00:00:00
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]");
OffsetDateTime::parse(v, format).map_err(E::custom)
}
// Comes from the docker image, the legacy format is:
// [2024, 309, 17, 15, 1, 698184971, 0,0,0]
// year, day in year, hour, minute, sec, subsec , offset stuff
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let mut vec = Vec::new();
// We must deserialize the value as `i64` because the largest values are `u32` and `i32`
while let Some(el) = seq.next_element::<i64>()? {
vec.push(el);
}
if vec.len() != 9 {
return Err(serde::de::Error::custom(format!(
"Invalid datetime, received an array of {} elements instead of 9",
vec.len()
)));
}
Ok(OffsetDateTime::new_in_offset(
Date::from_ordinal_date(vec[0] as i32, vec[1] as u16)
.map_err(serde::de::Error::custom)?,
Time::from_hms_nano(vec[2] as u8, vec[3] as u8, vec[4] as u8, vec[5] as u32)
.map_err(serde::de::Error::custom)?,
UtcOffset::from_hms(vec[6] as i8, vec[7] as i8, vec[8] as i8)
.map_err(serde::de::Error::custom)?,
))
}
}
deserializer.deserialize_any(Visitor).map(LegacyDateTime)
}
}

@@ -1,139 +0,0 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};
struct MatchIntervalWithScore {
interval: [usize; 2],
score: [i16; 3],
}
// count score for phrases
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be ordered, so +1 for each space between words
*order_score += words_in_phrase_minus_one;
// distance will always be 1, so -1 for each space between words
*distance_score -= words_in_phrase_minus_one;
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
let m_last_word_pos = match m.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
// in case last match is a phrase, count score for its words
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
}
ids.extend(m.ids.iter());
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniq_score, distance_score, order_score]
}
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
if matches.is_empty() {
panic!("`matches` should not be empty at this point");
}
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval: Option<MatchIntervalWithScore> = None;
let mut save_best_interval = |interval_first, interval_last| {
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
let is_interval_score_better = &best_interval
.as_ref()
.map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
if *is_interval_score_better {
best_interval = Some(MatchIntervalWithScore {
interval: [interval_first, interval_last],
score: interval_score,
});
}
};
// we compute the matches interval if we have at least 2 matches.
// current interval positions.
let mut interval_first = 0;
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the interval grow larger than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
// if index is 0 there is no last viable match
if index != 0 {
let interval_last = index - 1;
// keep interval if it's the best
save_best_interval(interval_first, interval_last);
}
// advance start of the interval while interval is longer than crop_size.
loop {
interval_first += 1;
if interval_first == matches.len() {
interval_first -= 1;
break;
}
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
if interval_first_match_first_word_pos > next_match_last_word_pos
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
{
break;
}
}
}
}
// compute the last interval score and compare it to the best one.
let interval_last = matches.len() - 1;
// if it's the last match with itself, we need to make sure it's
// not a phrase longer than the crop window
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
save_best_interval(interval_first, interval_last);
}
// if none of the matches fit the criteria above, default to the first one
best_interval.map_or(
[&matches[0], &matches[0]],
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
)
}

@@ -1,62 +0,0 @@
use super::matching_words::WordId;
#[derive(Clone, Debug)]
pub enum MatchPosition {
Word {
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
},
Phrase {
// position of the first and last word in the phrase in the whole text.
word_positions: [usize; 2],
// position of the first and last token in the phrase in the whole text.
token_positions: [usize; 2],
},
}
#[derive(Clone, Debug)]
pub struct Match {
pub char_count: usize,
// ids of the query words that match.
pub ids: Vec<WordId>,
pub position: MatchPosition,
}
impl Match {
pub(super) fn get_first_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
}
}
pub(super) fn get_last_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
}
}
pub(super) fn get_first_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
}
}
pub(super) fn get_last_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
}
}
pub(super) fn get_word_count(&self) -> usize {
match self.position {
MatchPosition::Word { .. } => 1,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
}
}
}

@@ -1,15 +0,0 @@
use charabia::{SeparatorKind, Token, TokenKind};
pub enum SimpleTokenKind {
Separator(SeparatorKind),
NotSeparator,
}
impl SimpleTokenKind {
pub fn new(token: &&Token<'_>) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NotSeparator,
}
}
}

Some files were not shown because too many files have changed in this diff.