mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Compare commits
9 Commits
94fb55bb6f
...
4e1ac9b0b4
Author | SHA1 | Date | |
---|---|---|---|
|
4e1ac9b0b4 | ||
|
8a18e37a3d | ||
|
36375ea326 | ||
|
bca2974266 | ||
|
13025594a8 | ||
|
2c1c33166d | ||
|
cdb6e3f45a | ||
|
1d574bd443 | ||
|
37a4fd7f99 |
2
.github/workflows/benchmarks-manual.yml
vendored
2
.github/workflows/benchmarks-manual.yml
vendored
@ -43,7 +43,7 @@ jobs:
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd crates/benchmarks
|
||||
cd benchmarks
|
||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
|
2
.github/workflows/benchmarks-pr.yml
vendored
2
.github/workflows/benchmarks-pr.yml
vendored
@ -88,7 +88,7 @@ jobs:
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd crates/benchmarks
|
||||
cd benchmarks
|
||||
cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
|
@ -41,7 +41,7 @@ jobs:
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd crates/benchmarks
|
||||
cd benchmarks
|
||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
|
@ -40,7 +40,7 @@ jobs:
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd crates/benchmarks
|
||||
cd benchmarks
|
||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
|
@ -40,7 +40,7 @@ jobs:
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd crates/benchmarks
|
||||
cd benchmarks
|
||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
|
@ -40,7 +40,7 @@ jobs:
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd crates/benchmarks
|
||||
cd benchmarks
|
||||
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
|
8
.github/workflows/flaky-tests.yml
vendored
8
.github/workflows/flaky-tests.yml
vendored
@ -21,10 +21,10 @@ jobs:
|
||||
- name: Install cargo-flaky
|
||||
run: cargo install cargo-flaky
|
||||
- name: Run cargo flaky in the dumps
|
||||
run: cd crates/dump; cargo flaky -i 100 --release
|
||||
run: cd dump; cargo flaky -i 100 --release
|
||||
- name: Run cargo flaky in the index-scheduler
|
||||
run: cd crates/index-scheduler; cargo flaky -i 100 --release
|
||||
run: cd index-scheduler; cargo flaky -i 100 --release
|
||||
- name: Run cargo flaky in the auth
|
||||
run: cd crates/meilisearch-auth; cargo flaky -i 100 --release
|
||||
run: cd meilisearch-auth; cargo flaky -i 100 --release
|
||||
- name: Run cargo flaky in meilisearch
|
||||
run: cd crates/meilisearch; cargo flaky -i 100 --release
|
||||
run: cd meilisearch; cargo flaky -i 100 --release
|
||||
|
6
.github/workflows/publish-binaries.yml
vendored
6
.github/workflows/publish-binaries.yml
vendored
@ -65,9 +65,9 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [macos-13, windows-2022]
|
||||
os: [macos-12, windows-2022]
|
||||
include:
|
||||
- os: macos-13
|
||||
- os: macos-12
|
||||
artifact_name: meilisearch
|
||||
asset_name: meilisearch-macos-amd64
|
||||
- os: windows-2022
|
||||
@ -90,7 +90,7 @@ jobs:
|
||||
|
||||
publish-macos-apple-silicon:
|
||||
name: Publish binary for macOS silicon
|
||||
runs-on: macos-13
|
||||
runs-on: macos-12
|
||||
needs: check-version
|
||||
strategy:
|
||||
matrix:
|
||||
|
14
.github/workflows/test-suite.yml
vendored
14
.github/workflows/test-suite.yml
vendored
@ -33,7 +33,7 @@ jobs:
|
||||
- name: Setup test with Rust stable
|
||||
uses: dtolnay/rust-toolchain@1.79
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.7.5
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -51,11 +51,11 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [macos-13, windows-2022]
|
||||
os: [macos-12, windows-2022]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.7.5
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- uses: dtolnay/rust-toolchain@1.79
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
@ -127,7 +127,7 @@ jobs:
|
||||
apt-get install build-essential -y
|
||||
- uses: dtolnay/rust-toolchain@1.79
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.7.5
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run tests in debug
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -144,7 +144,7 @@ jobs:
|
||||
profile: minimal
|
||||
components: clippy
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.7.5
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo clippy
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -163,11 +163,11 @@ jobs:
|
||||
override: true
|
||||
components: rustfmt
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.7.5
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo fmt
|
||||
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
|
||||
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
|
||||
# we are going to create an empty file where rustfmt expects it.
|
||||
run: |
|
||||
echo -ne "\n" > crates/benchmarks/benches/datasets_paths.rs
|
||||
echo -ne "\n" > benchmarks/benches/datasets_paths.rs
|
||||
cargo fmt --all -- --check
|
||||
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -5,6 +5,7 @@
|
||||
**/*.json_lines
|
||||
**/*.rs.bk
|
||||
/*.mdb
|
||||
/query-history.txt
|
||||
/data.ms
|
||||
/snapshots
|
||||
/dumps
|
||||
@ -18,4 +19,4 @@
|
||||
*.snap.new
|
||||
|
||||
# Fuzzcheck data for the facet indexing fuzz test
|
||||
crates/milli/fuzz/update::facet::incremental::fuzz::fuzz/
|
||||
milli/fuzz/update::facet::incremental::fuzz::fuzz/
|
||||
|
62
Cargo.lock
generated
62
Cargo.lock
generated
@ -404,25 +404,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arroy"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"heed",
|
||||
"log",
|
||||
"memmap2",
|
||||
"nohash",
|
||||
"ordered-float",
|
||||
"rand",
|
||||
"rayon",
|
||||
"roaring",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "assert-json-diff"
|
||||
version = "2.0.2"
|
||||
@ -491,7 +472,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "benchmarks"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
@ -672,7 +653,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "build-info"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"time",
|
||||
@ -726,9 +707,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.19.0"
|
||||
version = "1.16.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
|
||||
checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
|
||||
dependencies = [
|
||||
"bytemuck_derive",
|
||||
]
|
||||
@ -1642,7 +1623,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "dump"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"big_s",
|
||||
@ -1854,7 +1835,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
|
||||
|
||||
[[package]]
|
||||
name = "file-store"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
@ -1876,7 +1857,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "filter-parser"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"insta",
|
||||
"nom",
|
||||
@ -1896,7 +1877,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "flatten-serde-json"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"serde_json",
|
||||
@ -2020,7 +2001,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "fuzzers"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"arbitrary",
|
||||
"clap",
|
||||
@ -2572,10 +2553,10 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
|
||||
|
||||
[[package]]
|
||||
name = "index-scheduler"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"arroy",
|
||||
"big_s",
|
||||
"bincode",
|
||||
"crossbeam",
|
||||
@ -2766,7 +2747,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "json-depth-checker"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"serde_json",
|
||||
@ -3385,7 +3366,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "meili-snap"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"insta",
|
||||
"md5",
|
||||
@ -3394,7 +3375,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"actix-cors",
|
||||
"actix-http",
|
||||
@ -3484,7 +3465,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-auth"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"enum-iterator",
|
||||
@ -3503,7 +3484,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-types"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"actix-web",
|
||||
"anyhow",
|
||||
@ -3533,10 +3514,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilitool"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)",
|
||||
"clap",
|
||||
"dump",
|
||||
"file-store",
|
||||
@ -3565,9 +3545,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "milli"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"arroy",
|
||||
"big_s",
|
||||
"bimap",
|
||||
"bincode",
|
||||
@ -4011,7 +3991,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "permissive-json-pointer"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"big_s",
|
||||
"serde_json",
|
||||
@ -6400,7 +6380,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "xtask"
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"build-info",
|
||||
|
38
Cargo.toml
38
Cargo.toml
@ -1,28 +1,28 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = [
|
||||
"crates/meilisearch",
|
||||
"crates/meilitool",
|
||||
"crates/meilisearch-types",
|
||||
"crates/meilisearch-auth",
|
||||
"crates/meili-snap",
|
||||
"crates/index-scheduler",
|
||||
"crates/dump",
|
||||
"crates/file-store",
|
||||
"crates/permissive-json-pointer",
|
||||
"crates/milli",
|
||||
"crates/filter-parser",
|
||||
"crates/flatten-serde-json",
|
||||
"crates/json-depth-checker",
|
||||
"crates/benchmarks",
|
||||
"crates/fuzzers",
|
||||
"crates/tracing-trace",
|
||||
"crates/xtask",
|
||||
"crates/build-info",
|
||||
"meilisearch",
|
||||
"meilitool",
|
||||
"meilisearch-types",
|
||||
"meilisearch-auth",
|
||||
"meili-snap",
|
||||
"index-scheduler",
|
||||
"dump",
|
||||
"file-store",
|
||||
"permissive-json-pointer",
|
||||
"milli",
|
||||
"filter-parser",
|
||||
"flatten-serde-json",
|
||||
"json-depth-checker",
|
||||
"benchmarks",
|
||||
"fuzzers",
|
||||
"tracing-trace",
|
||||
"xtask",
|
||||
"build-info",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "1.11.0"
|
||||
version = "1.11.2"
|
||||
authors = [
|
||||
"Quentin de Quelen <quentin@dequelen.me>",
|
||||
"Clément Renault <clement@meilisearch.com>",
|
||||
|
@ -21,7 +21,6 @@ RUN set -eux; \
|
||||
|
||||
# Run
|
||||
FROM alpine:3.20
|
||||
LABEL org.opencontainers.image.source="https://github.com/meilisearch/meilisearch"
|
||||
|
||||
ENV MEILI_HTTP_ADDR 0.0.0.0:7700
|
||||
ENV MEILI_SERVER_PROVIDER docker
|
||||
|
@ -1,9 +1,6 @@
|
||||
<p align="center">
|
||||
<a href="https://www.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=logo#gh-light-mode-only" target="_blank">
|
||||
<img src="assets/meilisearch-logo-light.svg?sanitize=true#gh-light-mode-only">
|
||||
</a>
|
||||
<a href="https://www.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=logo#gh-dark-mode-only" target="_blank">
|
||||
<img src="assets/meilisearch-logo-dark.svg?sanitize=true#gh-dark-mode-only">
|
||||
<a href="https://www.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=logo" target="_blank">
|
||||
<img src="assets/meilisearch-logo-kawaii.png">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
|
BIN
assets/meilisearch-logo-kawaii.png
Normal file
BIN
assets/meilisearch-logo-kawaii.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 98 KiB |
@ -1,6 +1,6 @@
|
||||
status = [
|
||||
'Tests on ubuntu-20.04',
|
||||
'Tests on macos-13',
|
||||
'Tests on macos-12',
|
||||
'Tests on windows-2022',
|
||||
'Run Clippy',
|
||||
'Run Rustfmt',
|
||||
|
@ -1,109 +0,0 @@
|
||||
use std::any::Any;
|
||||
use std::sync::Arc;
|
||||
|
||||
use actix_web::HttpRequest;
|
||||
use meilisearch_types::InstanceUid;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
|
||||
use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery};
|
||||
use crate::Opt;
|
||||
|
||||
pub struct MockAnalytics {
|
||||
instance_uid: Option<InstanceUid>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SearchAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl SearchAggregator {
|
||||
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, _: &dyn Any) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SimilarAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl SimilarAggregator {
|
||||
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, _: &dyn Any) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MultiSearchAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl MultiSearchAggregator {
|
||||
pub fn from_federated_search(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct FacetSearchAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl FacetSearchAggregator {
|
||||
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, _: &dyn Any) {}
|
||||
}
|
||||
|
||||
impl MockAnalytics {
|
||||
#[allow(clippy::new_ret_no_self)]
|
||||
pub fn new(opt: &Opt) -> Arc<dyn Analytics> {
|
||||
let instance_uid = find_user_id(&opt.db_path);
|
||||
Arc::new(Self { instance_uid })
|
||||
}
|
||||
}
|
||||
|
||||
impl Analytics for MockAnalytics {
|
||||
fn instance_uid(&self) -> Option<&meilisearch_types::InstanceUid> {
|
||||
self.instance_uid.as_ref()
|
||||
}
|
||||
|
||||
// These methods are no-ops and should be optimized out
|
||||
fn publish(&self, _event_name: String, _send: Value, _request: Option<&HttpRequest>) {}
|
||||
fn get_search(&self, _aggregate: super::SearchAggregator) {}
|
||||
fn post_search(&self, _aggregate: super::SearchAggregator) {}
|
||||
fn get_similar(&self, _aggregate: super::SimilarAggregator) {}
|
||||
fn post_similar(&self, _aggregate: super::SimilarAggregator) {}
|
||||
fn post_multi_search(&self, _aggregate: super::MultiSearchAggregator) {}
|
||||
fn post_facet_search(&self, _aggregate: super::FacetSearchAggregator) {}
|
||||
fn add_documents(
|
||||
&self,
|
||||
_documents_query: &UpdateDocumentsQuery,
|
||||
_index_creation: bool,
|
||||
_request: &HttpRequest,
|
||||
) {
|
||||
}
|
||||
fn delete_documents(&self, _kind: DocumentDeletionKind, _request: &HttpRequest) {}
|
||||
fn update_documents(
|
||||
&self,
|
||||
_documents_query: &UpdateDocumentsQuery,
|
||||
_index_creation: bool,
|
||||
_request: &HttpRequest,
|
||||
) {
|
||||
}
|
||||
fn update_documents_by_function(
|
||||
&self,
|
||||
_documents_query: &DocumentEditionByFunction,
|
||||
_index_creation: bool,
|
||||
_request: &HttpRequest,
|
||||
) {
|
||||
}
|
||||
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
|
||||
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
|
||||
}
|
@ -1,334 +0,0 @@
|
||||
use std::fs::{read_dir, read_to_string, remove_file, File};
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Context;
|
||||
use clap::{Parser, Subcommand};
|
||||
use dump::{DumpWriter, IndexMetadata};
|
||||
use file_store::FileStore;
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::heed::types::{SerdeJson, Str};
|
||||
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
|
||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
||||
use meilisearch_types::tasks::{Status, Task};
|
||||
use meilisearch_types::versioning::{get_version, parse_version};
|
||||
use meilisearch_types::Index;
|
||||
use time::macros::format_description;
|
||||
use time::OffsetDateTime;
|
||||
use upgrade::OfflineUpgrade;
|
||||
use uuid_codec::UuidCodec;
|
||||
|
||||
mod upgrade;
|
||||
mod uuid_codec;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Cli {
|
||||
/// The path to the Meilisearch database directory.
|
||||
#[arg(long, default_value = "data.ms/")]
|
||||
db_path: PathBuf,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Command {
|
||||
/// Clears the task queue and makes it empty.
|
||||
///
|
||||
/// This command can be safely executed even if Meilisearch is running and processing tasks.
|
||||
/// Once the task queue is empty you can restart Meilisearch and no more tasks should be visible,
|
||||
/// even the ones that were processing. However, it's highly possible that you see the processing
|
||||
/// tasks in the queue again with an associated internal error message.
|
||||
ClearTaskQueue,
|
||||
|
||||
/// Exports a dump from the Meilisearch database.
|
||||
///
|
||||
/// Make sure to run this command when Meilisearch is not running or running but not processing tasks.
|
||||
/// If tasks are being processed while a dump is being exported there are chances for the dump to be
|
||||
/// malformed with missing tasks.
|
||||
///
|
||||
/// TODO Verify this claim or make sure it cannot happen and we can export dumps
|
||||
/// without caring about killing Meilisearch first!
|
||||
ExportADump {
|
||||
/// The directory in which the dump will be created.
|
||||
#[arg(long, default_value = "dumps/")]
|
||||
dump_dir: PathBuf,
|
||||
|
||||
/// Skip dumping the enqueued or processing tasks.
|
||||
///
|
||||
/// Can be useful when there are a lot of them and it is not particularly useful
|
||||
/// to keep them. Note that only the enqueued tasks take up space so skipping
|
||||
/// the processed ones is not particularly interesting.
|
||||
#[arg(long)]
|
||||
skip_enqueued_tasks: bool,
|
||||
},
|
||||
|
||||
/// Attempts to upgrade from one major version to the next without a dump.
|
||||
///
|
||||
/// Make sure to run this command when Meilisearch is not running!
|
||||
/// If Meilisearch is running while executing this command, the database could be corrupted
|
||||
/// (contain data from both the old and the new versions)
|
||||
///
|
||||
/// Supported upgrade paths:
|
||||
///
|
||||
/// - v1.9.x -> v1.10.x -> v1.11.x
|
||||
OfflineUpgrade {
|
||||
#[arg(long)]
|
||||
target_version: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let Cli { db_path, command } = Cli::parse();
|
||||
|
||||
let detected_version = get_version(&db_path).context("While checking the version file")?;
|
||||
|
||||
match command {
|
||||
Command::ClearTaskQueue => clear_task_queue(db_path),
|
||||
Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
|
||||
export_a_dump(db_path, dump_dir, skip_enqueued_tasks)
|
||||
}
|
||||
Command::OfflineUpgrade { target_version } => {
|
||||
let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?;
|
||||
OfflineUpgrade { db_path, current_version: detected_version, target_version }.upgrade()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Clears the task queue located at `db_path`.
|
||||
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
|
||||
let path = db_path.join("tasks");
|
||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) }
|
||||
.with_context(|| format!("While trying to open {:?}", path.display()))?;
|
||||
|
||||
eprintln!("Deleting tasks from the database...");
|
||||
|
||||
let mut wtxn = env.write_txn()?;
|
||||
let all_tasks = try_opening_poly_database(&env, &wtxn, "all-tasks")?;
|
||||
let total = all_tasks.len(&wtxn)?;
|
||||
let status = try_opening_poly_database(&env, &wtxn, "status")?;
|
||||
let kind = try_opening_poly_database(&env, &wtxn, "kind")?;
|
||||
let index_tasks = try_opening_poly_database(&env, &wtxn, "index-tasks")?;
|
||||
let canceled_by = try_opening_poly_database(&env, &wtxn, "canceled_by")?;
|
||||
let enqueued_at = try_opening_poly_database(&env, &wtxn, "enqueued-at")?;
|
||||
let started_at = try_opening_poly_database(&env, &wtxn, "started-at")?;
|
||||
let finished_at = try_opening_poly_database(&env, &wtxn, "finished-at")?;
|
||||
|
||||
try_clearing_poly_database(&mut wtxn, all_tasks, "all-tasks")?;
|
||||
try_clearing_poly_database(&mut wtxn, status, "status")?;
|
||||
try_clearing_poly_database(&mut wtxn, kind, "kind")?;
|
||||
try_clearing_poly_database(&mut wtxn, index_tasks, "index-tasks")?;
|
||||
try_clearing_poly_database(&mut wtxn, canceled_by, "canceled_by")?;
|
||||
try_clearing_poly_database(&mut wtxn, enqueued_at, "enqueued-at")?;
|
||||
try_clearing_poly_database(&mut wtxn, started_at, "started-at")?;
|
||||
try_clearing_poly_database(&mut wtxn, finished_at, "finished-at")?;
|
||||
|
||||
wtxn.commit().context("While committing the transaction")?;
|
||||
|
||||
eprintln!("Successfully deleted {total} tasks from the tasks database!");
|
||||
eprintln!("Deleting the content files from disk...");
|
||||
|
||||
let mut count = 0usize;
|
||||
let update_files = db_path.join("update_files");
|
||||
let entries = read_dir(&update_files).with_context(|| {
|
||||
format!("While trying to read the content of {:?}", update_files.display())
|
||||
})?;
|
||||
for result in entries {
|
||||
match result {
|
||||
Ok(ent) => match remove_file(ent.path()) {
|
||||
Ok(_) => count += 1,
|
||||
Err(e) => eprintln!("Error while deleting {:?}: {}", ent.path().display(), e),
|
||||
},
|
||||
Err(e) => {
|
||||
eprintln!("Error while reading a file in {:?}: {}", update_files.display(), e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Successfully deleted {count} content files from disk!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_opening_database<KC: 'static, DC: 'static>(
|
||||
env: &Env,
|
||||
rtxn: &RoTxn,
|
||||
db_name: &str,
|
||||
) -> anyhow::Result<Database<KC, DC>> {
|
||||
env.open_database(rtxn, Some(db_name))
|
||||
.with_context(|| format!("While opening the {db_name:?} database"))?
|
||||
.with_context(|| format!("Missing the {db_name:?} database"))
|
||||
}
|
||||
|
||||
fn try_opening_poly_database(
|
||||
env: &Env,
|
||||
rtxn: &RoTxn,
|
||||
db_name: &str,
|
||||
) -> anyhow::Result<Database<Unspecified, Unspecified>> {
|
||||
env.database_options()
|
||||
.name(db_name)
|
||||
.open(rtxn)
|
||||
.with_context(|| format!("While opening the {db_name:?} poly database"))?
|
||||
.with_context(|| format!("Missing the {db_name:?} poly database"))
|
||||
}
|
||||
|
||||
fn try_clearing_poly_database(
|
||||
wtxn: &mut RwTxn,
|
||||
database: Database<Unspecified, Unspecified>,
|
||||
db_name: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database"))
|
||||
}
|
||||
|
||||
/// Exports a dump into the dump directory.
|
||||
fn export_a_dump(
|
||||
db_path: PathBuf,
|
||||
dump_dir: PathBuf,
|
||||
skip_enqueued_tasks: bool,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let started_at = OffsetDateTime::now_utc();
|
||||
|
||||
// 1. Extracts the instance UID from disk
|
||||
let instance_uid_path = db_path.join("instance-uid");
|
||||
let instance_uid = match read_to_string(&instance_uid_path) {
|
||||
Ok(content) => match content.trim().parse() {
|
||||
Ok(uuid) => Some(uuid),
|
||||
Err(e) => {
|
||||
eprintln!("Impossible to parse instance-uid: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
eprintln!("Impossible to read {}: {}", instance_uid_path.display(), e);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let dump = DumpWriter::new(instance_uid).context("While creating a new dump")?;
|
||||
let file_store =
|
||||
FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?;
|
||||
|
||||
let index_scheduler_path = db_path.join("tasks");
|
||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||
|
||||
eprintln!("Dumping the keys...");
|
||||
|
||||
// 2. dump the keys
|
||||
let auth_store = AuthController::new(&db_path, &None)
|
||||
.with_context(|| format!("While opening the auth store at {}", db_path.display()))?;
|
||||
let mut dump_keys = dump.create_keys()?;
|
||||
let mut count = 0;
|
||||
for key in auth_store.list_keys()? {
|
||||
dump_keys.push_key(&key)?;
|
||||
count += 1;
|
||||
}
|
||||
dump_keys.flush()?;
|
||||
|
||||
eprintln!("Successfully dumped {count} keys!");
|
||||
|
||||
let rtxn = env.read_txn()?;
|
||||
let all_tasks: Database<BEU32, SerdeJson<Task>> =
|
||||
try_opening_database(&env, &rtxn, "all-tasks")?;
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &rtxn, "index-mapping")?;
|
||||
|
||||
if skip_enqueued_tasks {
|
||||
eprintln!("Skip dumping the enqueued tasks...");
|
||||
} else {
|
||||
eprintln!("Dumping the enqueued tasks...");
|
||||
|
||||
// 3. dump the tasks
|
||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||
let mut count = 0;
|
||||
for ret in all_tasks.iter(&rtxn)? {
|
||||
let (_, t) = ret?;
|
||||
let status = t.status;
|
||||
let content_file = t.content_uuid();
|
||||
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
|
||||
|
||||
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||
if let Some(content_file_uuid) = content_file {
|
||||
if status == Status::Enqueued {
|
||||
let content_file = file_store.get_update(content_file_uuid)?;
|
||||
|
||||
let reader =
|
||||
DocumentsBatchReader::from_reader(content_file).with_context(|| {
|
||||
format!("While reading content file {:?}", content_file_uuid)
|
||||
})?;
|
||||
|
||||
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
while let Some(doc) = cursor.next_document().with_context(|| {
|
||||
format!("While iterating on content file {:?}", content_file_uuid)
|
||||
})? {
|
||||
dump_content_file
|
||||
.push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
|
||||
}
|
||||
dump_content_file.flush()?;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
dump_tasks.flush()?;
|
||||
|
||||
eprintln!("Successfully dumped {count} enqueued tasks!");
|
||||
}
|
||||
|
||||
eprintln!("Dumping the indexes...");
|
||||
|
||||
// 4. Dump the indexes
|
||||
let mut count = 0;
|
||||
for result in index_mapping.iter(&rtxn)? {
|
||||
let (uid, uuid) = result?;
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
let index = Index::new(EnvOpenOptions::new(), &index_path).with_context(|| {
|
||||
format!("While trying to open the index at path {:?}", index_path.display())
|
||||
})?;
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
let metadata = IndexMetadata {
|
||||
uid: uid.to_owned(),
|
||||
primary_key: index.primary_key(&rtxn)?.map(String::from),
|
||||
created_at: index.created_at(&rtxn)?,
|
||||
updated_at: index.updated_at(&rtxn)?,
|
||||
};
|
||||
let mut index_dumper = dump.create_index(uid, &metadata)?;
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
|
||||
// 4.1. Dump the documents
|
||||
for ret in index.all_documents(&rtxn)? {
|
||||
let (_id, doc) = ret?;
|
||||
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
index_dumper.push_document(&document)?;
|
||||
}
|
||||
|
||||
// 4.2. Dump the settings
|
||||
let settings = meilisearch_types::settings::settings(
|
||||
&index,
|
||||
&rtxn,
|
||||
meilisearch_types::settings::SecretPolicy::RevealSecrets,
|
||||
)?;
|
||||
index_dumper.settings(&settings)?;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
eprintln!("Successfully dumped {count} indexes!");
|
||||
// We will not dump experimental feature settings
|
||||
eprintln!("The tool is not dumping experimental features, please set them by hand afterward");
|
||||
|
||||
let dump_uid = started_at.format(format_description!(
|
||||
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
|
||||
)).unwrap();
|
||||
|
||||
let path = dump_dir.join(format!("{}.dump", dump_uid));
|
||||
let file = File::create(&path)?;
|
||||
dump.persist_to(BufWriter::new(file))?;
|
||||
|
||||
eprintln!("Dump exported at path {:?}", path.display());
|
||||
|
||||
Ok(())
|
||||
}
|
@ -1,73 +0,0 @@
|
||||
mod v1_10;
|
||||
mod v1_11;
|
||||
mod v1_9;
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use meilisearch_types::versioning::create_version_file;
|
||||
|
||||
use v1_10::v1_9_to_v1_10;
|
||||
|
||||
use crate::upgrade::v1_11::v1_10_to_v1_11;
|
||||
|
||||
/// Parameters of an offline database upgrade.
///
/// Versions are `(major, minor, patch)` triples kept as strings.
pub struct OfflineUpgrade {
    /// Path of the database directory to upgrade.
    pub db_path: PathBuf,
    /// Version the database is currently at.
    pub current_version: (String, String, String),
    /// Version the database must be brought to.
    pub target_version: (String, String, String),
}
|
||||
|
||||
impl OfflineUpgrade {
|
||||
pub fn upgrade(self) -> anyhow::Result<()> {
|
||||
let upgrade_list = [
|
||||
(v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"),
|
||||
(v1_10_to_v1_11, "1", "11", "0"),
|
||||
];
|
||||
|
||||
let (current_major, current_minor, current_patch) = &self.current_version;
|
||||
|
||||
let start_at = match (
|
||||
current_major.as_str(),
|
||||
current_minor.as_str(),
|
||||
current_patch.as_str(),
|
||||
) {
|
||||
("1", "9", _) => 0,
|
||||
("1", "10", _) => 1,
|
||||
_ => {
|
||||
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10")
|
||||
}
|
||||
};
|
||||
|
||||
let (target_major, target_minor, target_patch) = &self.target_version;
|
||||
|
||||
let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
|
||||
("1", "10", _) => 0,
|
||||
("1", "11", _) => 1,
|
||||
(major, _, _) if major.starts_with('v') => {
|
||||
bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.")
|
||||
}
|
||||
_ => {
|
||||
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11")
|
||||
}
|
||||
};
|
||||
|
||||
println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
|
||||
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for index in start_at..=ends_at {
|
||||
let (func, major, minor, patch) = upgrade_list[index];
|
||||
(func)(&self.db_path)?;
|
||||
println!("Done");
|
||||
// We're writing the version file just in case an issue arise _while_ upgrading.
|
||||
// We don't want the DB to fail in an unknown state.
|
||||
println!("Writing VERSION file");
|
||||
|
||||
create_version_file(&self.db_path, major, minor, patch)
|
||||
.context("while writing VERSION file after the upgrade")?;
|
||||
}
|
||||
|
||||
println!("Success");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
@ -1,289 +0,0 @@
|
||||
use anyhow::bail;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use meilisearch_types::{
|
||||
heed::{
|
||||
types::{SerdeJson, Str},
|
||||
Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
|
||||
},
|
||||
milli::index::{db_name, main_key},
|
||||
};
|
||||
|
||||
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
|
||||
|
||||
use super::v1_9;
|
||||
|
||||
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
|
||||
|
||||
/// The statistics that can be computed from an `Index` object.
|
||||
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
||||
pub struct IndexStats {
|
||||
/// Number of documents in the index.
|
||||
pub number_of_documents: u64,
|
||||
/// Size taken up by the index' DB, in bytes.
|
||||
///
|
||||
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
|
||||
/// are not returned to the disk after a deletion, this number is typically larger than
|
||||
/// `used_database_size` that only includes the size of the used pages.
|
||||
pub database_size: u64,
|
||||
/// Size taken by the used pages of the index' DB, in bytes.
|
||||
///
|
||||
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
||||
/// this value is typically smaller than `database_size`.
|
||||
pub used_database_size: u64,
|
||||
/// Association of every field name with the number of times it occurs in the documents.
|
||||
pub field_distribution: FieldDistribution,
|
||||
/// Creation date of the index.
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: time::OffsetDateTime,
|
||||
/// Date of the last update of the index.
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub updated_at: time::OffsetDateTime,
|
||||
}
|
||||
|
||||
impl From<v1_9::IndexStats> for IndexStats {
    /// Converts v1.9 stats field by field, unwrapping the legacy datetimes.
    fn from(legacy: v1_9::IndexStats) -> Self {
        Self {
            number_of_documents: legacy.number_of_documents,
            database_size: legacy.database_size,
            used_database_size: legacy.used_database_size,
            field_distribution: legacy.field_distribution,
            // `LegacyDateTime` is a thin wrapper around `time::OffsetDateTime`.
            created_at: legacy.created_at.0,
            updated_at: legacy.updated_at.0,
        }
    }
}
|
||||
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime);
|
||||
|
||||
fn update_index_stats(
|
||||
index_stats: Database<UuidCodec, Unspecified>,
|
||||
index_uid: &str,
|
||||
index_uuid: uuid::Uuid,
|
||||
sched_wtxn: &mut RwTxn,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = || format!("while updating index stats for index `{index_uid}`");
|
||||
|
||||
let stats: Option<&str> = index_stats
|
||||
.remap_data_type::<Str>()
|
||||
.get(sched_wtxn, &index_uuid)
|
||||
.with_context(ctx)
|
||||
.with_context(|| "While reading value")?;
|
||||
dbg!(stats);
|
||||
|
||||
let stats: Option<v1_9::IndexStats> = index_stats
|
||||
.remap_data_type::<SerdeJson<v1_9::IndexStats>>()
|
||||
.get(sched_wtxn, &index_uuid)
|
||||
.with_context(ctx)
|
||||
.with_context(|| "While reading value")?;
|
||||
|
||||
if let Some(stats) = stats {
|
||||
let stats: self::IndexStats = stats.into();
|
||||
|
||||
index_stats
|
||||
.remap_data_type::<SerdeJson<self::IndexStats>>()
|
||||
.put(sched_wtxn, &index_uuid, &stats)
|
||||
.with_context(ctx)
|
||||
.with_context(|| "While writing value")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_date_format(
|
||||
index_uid: &str,
|
||||
index_env: &Env,
|
||||
index_wtxn: &mut RwTxn,
|
||||
) -> anyhow::Result<()> {
|
||||
let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN)
|
||||
.with_context(|| format!("while updating date format for index `{index_uid}`"))?;
|
||||
|
||||
date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?;
|
||||
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_rest_embedders(
|
||||
index_uid: &str,
|
||||
index_env: &Env,
|
||||
index_txn: &RoTxn,
|
||||
) -> anyhow::Result<Vec<String>> {
|
||||
let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN)
|
||||
.with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?;
|
||||
|
||||
let mut rest_embedders = vec![];
|
||||
|
||||
for config in main
|
||||
.remap_types::<Str, SerdeJson<Vec<v1_9::IndexEmbeddingConfig>>>()
|
||||
.get(index_txn, main_key::EMBEDDING_CONFIGS)?
|
||||
.unwrap_or_default()
|
||||
{
|
||||
if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options {
|
||||
rest_embedders.push(config.name);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rest_embedders)
|
||||
}
|
||||
|
||||
fn date_round_trip(
|
||||
wtxn: &mut RwTxn,
|
||||
index_uid: &str,
|
||||
db: Database<Unspecified, Unspecified>,
|
||||
key: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let datetime =
|
||||
db.remap_types::<Str, SerdeJson<v1_9::LegacyDateTime>>().get(wtxn, key).with_context(
|
||||
|| format!("could not read `{key}` while updating date format for index `{index_uid}`"),
|
||||
)?;
|
||||
|
||||
if let Some(datetime) = datetime {
|
||||
db.remap_types::<Str, SerdeJson<self::OffsetDateTime>>()
|
||||
.put(wtxn, key, &self::OffsetDateTime(datetime.0))
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"could not write `{key}` while updating date format for index `{index_uid}`"
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> {
|
||||
println!("Upgrading from v1.9.0 to v1.10.0");
|
||||
// 2 changes here
|
||||
|
||||
// 1. date format. needs to be done before opening the Index
|
||||
// 2. REST embedders. We don't support this case right now, so bail
|
||||
|
||||
let index_scheduler_path = db_path.join("tasks");
|
||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||
|
||||
let mut sched_wtxn = env.write_txn()?;
|
||||
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &sched_wtxn, "index-mapping")?;
|
||||
|
||||
let index_stats: Database<UuidCodec, Unspecified> =
|
||||
try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| {
|
||||
format!("While trying to open {:?}", index_scheduler_path.display())
|
||||
})?;
|
||||
|
||||
let index_count =
|
||||
index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?;
|
||||
|
||||
// FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn
|
||||
// 1. immutably for the iteration
|
||||
// 2. mutably for updating index stats
|
||||
let indexes: Vec<_> = index_mapping
|
||||
.iter(&sched_wtxn)?
|
||||
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
|
||||
.collect();
|
||||
|
||||
let mut rest_embedders = Vec::new();
|
||||
|
||||
let mut unwrapped_indexes = Vec::new();
|
||||
|
||||
// check that update can take place
|
||||
for (index_index, result) in indexes.into_iter().enumerate() {
|
||||
let (uid, uuid) = result?;
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
|
||||
println!(
|
||||
"[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`",
|
||||
index_index + 1,
|
||||
index_path.display()
|
||||
);
|
||||
|
||||
let index_env = unsafe {
|
||||
// FIXME: fetch the 25 magic number from the index file
|
||||
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
||||
format!("while opening index {uid} at '{}'", index_path.display())
|
||||
})?
|
||||
};
|
||||
|
||||
let index_txn = index_env.read_txn().with_context(|| {
|
||||
format!(
|
||||
"while obtaining a write transaction for index {uid} at {}",
|
||||
index_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
println!("\t- Checking for incompatible embedders (REST embedders)");
|
||||
let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?;
|
||||
|
||||
if rest_embedders_for_index.is_empty() {
|
||||
unwrapped_indexes.push((uid, uuid));
|
||||
} else {
|
||||
// no need to add to unwrapped indexes because we'll exit early
|
||||
rest_embedders.push((uid, rest_embedders_for_index));
|
||||
}
|
||||
}
|
||||
|
||||
if !rest_embedders.is_empty() {
|
||||
let rest_embedders = rest_embedders
|
||||
.into_iter()
|
||||
.flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders))
|
||||
.map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\
|
||||
The database has not been modified and is still a valid v1.9 database.");
|
||||
}
|
||||
|
||||
println!("Update can take place, updating");
|
||||
|
||||
for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() {
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
|
||||
println!(
|
||||
"[{}/{index_count}]Updating index `{uid}` at `{}`",
|
||||
index_index + 1,
|
||||
index_path.display()
|
||||
);
|
||||
|
||||
let index_env = unsafe {
|
||||
// FIXME: fetch the 25 magic number from the index file
|
||||
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
||||
format!("while opening index {uid} at '{}'", index_path.display())
|
||||
})?
|
||||
};
|
||||
|
||||
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
||||
format!(
|
||||
"while obtaining a write transaction for index `{uid}` at `{}`",
|
||||
index_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
println!("\t- Updating index stats");
|
||||
update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?;
|
||||
println!("\t- Updating date format");
|
||||
update_date_format(&uid, &index_env, &mut index_wtxn)?;
|
||||
|
||||
index_wtxn.commit().with_context(|| {
|
||||
format!("while committing the write txn for index `{uid}` at {}", index_path.display())
|
||||
})?;
|
||||
}
|
||||
|
||||
sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?;
|
||||
|
||||
println!("Upgrading database succeeded");
|
||||
|
||||
Ok(())
|
||||
}
|
@ -1,85 +0,0 @@
|
||||
//! The breaking changes that happened between the v1.10 and the v1.11 are:
|
||||
//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0
|
||||
//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata.
|
||||
//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB.
|
||||
//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use meilisearch_types::{
|
||||
heed::{types::Str, Database, EnvOpenOptions},
|
||||
milli::index::db_name,
|
||||
};
|
||||
|
||||
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
|
||||
|
||||
pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> {
|
||||
println!("Upgrading from v1.10.0 to v1.11.0");
|
||||
|
||||
let index_scheduler_path = db_path.join("tasks");
|
||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||
|
||||
let sched_rtxn = env.read_txn()?;
|
||||
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &sched_rtxn, "index-mapping")?;
|
||||
|
||||
let index_count =
|
||||
index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?;
|
||||
|
||||
let indexes: Vec<_> = index_mapping
|
||||
.iter(&sched_rtxn)?
|
||||
.map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid)))
|
||||
.collect();
|
||||
|
||||
for (index_index, result) in indexes.into_iter().enumerate() {
|
||||
let (uid, uuid) = result?;
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
|
||||
println!(
|
||||
"[{}/{index_count}]Updating embeddings for `{uid}` at `{}`",
|
||||
index_index + 1,
|
||||
index_path.display()
|
||||
);
|
||||
|
||||
let index_env = unsafe {
|
||||
EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| {
|
||||
format!("while opening index {uid} at '{}'", index_path.display())
|
||||
})?
|
||||
};
|
||||
|
||||
let index_rtxn = index_env.read_txn().with_context(|| {
|
||||
format!(
|
||||
"while obtaining a read transaction for index {uid} at {}",
|
||||
index_path.display()
|
||||
)
|
||||
})?;
|
||||
let index_read_database =
|
||||
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
|
||||
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
||||
|
||||
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
||||
format!(
|
||||
"while obtaining a write transaction for index {uid} at {}",
|
||||
index_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let index_write_database =
|
||||
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
|
||||
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
||||
|
||||
arroy_v04_to_v05::ugrade_from_prev_version(
|
||||
&index_rtxn,
|
||||
index_read_database,
|
||||
&mut index_wtxn,
|
||||
index_write_database,
|
||||
)?;
|
||||
|
||||
index_wtxn.commit()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
@ -1,158 +0,0 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::{Date, OffsetDateTime, Time, UtcOffset};
|
||||
|
||||
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;
|
||||
|
||||
/// The statistics that can be computed from an `Index` object.
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
pub struct IndexStats {
|
||||
/// Number of documents in the index.
|
||||
pub number_of_documents: u64,
|
||||
/// Size taken up by the index' DB, in bytes.
|
||||
///
|
||||
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
|
||||
/// are not returned to the disk after a deletion, this number is typically larger than
|
||||
/// `used_database_size` that only includes the size of the used pages.
|
||||
pub database_size: u64,
|
||||
/// Size taken by the used pages of the index' DB, in bytes.
|
||||
///
|
||||
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
||||
/// this value is typically smaller than `database_size`.
|
||||
pub used_database_size: u64,
|
||||
/// Association of every field name with the number of times it occurs in the documents.
|
||||
pub field_distribution: FieldDistribution,
|
||||
/// Creation date of the index.
|
||||
pub created_at: LegacyDateTime,
|
||||
/// Date of the last update of the index.
|
||||
pub updated_at: LegacyDateTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct IndexEmbeddingConfig {
|
||||
pub name: String,
|
||||
pub config: EmbeddingConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbeddingConfig {
|
||||
/// Options of the embedder, specific to each kind of embedder
|
||||
pub embedder_options: EmbedderOptions,
|
||||
}
|
||||
|
||||
/// Options of an embedder, specific to each kind of embedder.
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub enum EmbedderOptions {
|
||||
HuggingFace(hf::EmbedderOptions),
|
||||
OpenAi(openai::EmbedderOptions),
|
||||
Ollama(ollama::EmbedderOptions),
|
||||
UserProvided(manual::EmbedderOptions),
|
||||
Rest(rest::EmbedderOptions),
|
||||
}
|
||||
|
||||
impl Default for EmbedderOptions {
|
||||
fn default() -> Self {
|
||||
Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None })
|
||||
}
|
||||
}
|
||||
|
||||
mod hf {
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbedderOptions {
|
||||
pub model: String,
|
||||
pub revision: Option<String>,
|
||||
}
|
||||
}
|
||||
mod openai {
|
||||
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbedderOptions {
|
||||
pub api_key: Option<String>,
|
||||
pub dimensions: Option<usize>,
|
||||
}
|
||||
}
|
||||
mod ollama {
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbedderOptions {
|
||||
pub embedding_model: String,
|
||||
pub url: Option<String>,
|
||||
pub api_key: Option<String>,
|
||||
}
|
||||
}
|
||||
mod manual {
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbedderOptions {
|
||||
pub dimensions: usize,
|
||||
}
|
||||
}
|
||||
mod rest {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
|
||||
pub struct EmbedderOptions {
|
||||
pub api_key: Option<String>,
|
||||
pub dimensions: Option<usize>,
|
||||
pub url: String,
|
||||
pub input_field: Vec<String>,
|
||||
// path to the array of embeddings
|
||||
pub path_to_embeddings: Vec<String>,
|
||||
// shape of a single embedding
|
||||
pub embedding_object: Vec<String>,
|
||||
}
|
||||
}
|
||||
|
||||
/// A datetime from Meilisearch v1.9 with an unspecified format.
|
||||
#[derive(Debug)]
|
||||
pub struct LegacyDateTime(pub OffsetDateTime);
|
||||
|
||||
impl<'de> Deserialize<'de> for LegacyDateTime {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct Visitor;
|
||||
impl<'de> serde::de::Visitor<'de> for Visitor {
|
||||
type Value = OffsetDateTime;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(formatter, "a valid datetime")
|
||||
}
|
||||
|
||||
// Comes from a binary. The legacy format is:
|
||||
// 2024-11-04 13:32:08.48368 +00:00:00
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]");
|
||||
OffsetDateTime::parse(v, format).map_err(E::custom)
|
||||
}
|
||||
|
||||
// Comes from the docker image, the legacy format is:
|
||||
// [2024, 309, 17, 15, 1, 698184971, 0,0,0]
|
||||
// year, day in year, hour, minute, sec, subsec , offset stuff
|
||||
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let mut vec = Vec::new();
|
||||
// We must deserialize the value as `i64` because the largest values are `u32` and `i32`
|
||||
while let Some(el) = seq.next_element::<i64>()? {
|
||||
vec.push(el);
|
||||
}
|
||||
if vec.len() != 9 {
|
||||
return Err(serde::de::Error::custom(format!(
|
||||
"Invalid datetime, received an array of {} elements instead of 9",
|
||||
vec.len()
|
||||
)));
|
||||
}
|
||||
Ok(OffsetDateTime::new_in_offset(
|
||||
Date::from_ordinal_date(vec[0] as i32, vec[1] as u16)
|
||||
.map_err(serde::de::Error::custom)?,
|
||||
Time::from_hms_nano(vec[2] as u8, vec[3] as u8, vec[4] as u8, vec[5] as u32)
|
||||
.map_err(serde::de::Error::custom)?,
|
||||
UtcOffset::from_hms(vec[6] as i8, vec[7] as i8, vec[8] as i8)
|
||||
.map_err(serde::de::Error::custom)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
deserializer.deserialize_any(Visitor).map(LegacyDateTime)
|
||||
}
|
||||
}
|
@ -1,139 +0,0 @@
|
||||
use super::matching_words::WordId;
|
||||
use super::{Match, MatchPosition};
|
||||
|
||||
/// An interval of matches together with the score computed for it.
struct MatchIntervalWithScore {
    /// Indices, in the matches slice, of the first and last match of the interval.
    interval: [usize; 2],
    /// Score of the interval, as returned by `get_interval_score`.
    score: [i16; 3],
}
|
||||
|
||||
/// Adds the contribution of a phrase to the order and distance scores.
///
/// `fwp` and `lwp` are the positions of the first and last word of the phrase.
/// Words of a phrase are always ordered and adjacent, so every gap between two
/// consecutive words earns +1 of order score and costs 1 of distance score.
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
    // Number of gaps between the words of the phrase (word count minus one).
    let gaps = (*lwp - *fwp) as i16;
    *order_score += gaps;
    *distance_score -= gaps;
}
|
||||
|
||||
/// Compute the score of a match interval:
|
||||
/// 1) count unique matches
|
||||
/// 2) calculate distance between matches
|
||||
/// 3) count ordered matches
|
||||
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
|
||||
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
||||
let mut order_score = 0;
|
||||
let mut distance_score = 0;
|
||||
|
||||
let mut iter = matches.iter().peekable();
|
||||
while let Some(m) = iter.next() {
|
||||
if let Some(next_match) = iter.peek() {
|
||||
// if matches are ordered
|
||||
if next_match.ids.iter().min() > m.ids.iter().min() {
|
||||
order_score += 1;
|
||||
}
|
||||
|
||||
let m_last_word_pos = match m.position {
|
||||
MatchPosition::Word { word_position, .. } => word_position,
|
||||
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
|
||||
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
|
||||
lwp
|
||||
}
|
||||
};
|
||||
let next_match_first_word_pos = next_match.get_first_word_pos();
|
||||
|
||||
// compute distance between matches
|
||||
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
|
||||
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
|
||||
// in case last match is a phrase, count score for its words
|
||||
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
|
||||
}
|
||||
|
||||
ids.extend(m.ids.iter());
|
||||
}
|
||||
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
let uniq_score = ids.len() as i16;
|
||||
|
||||
// rank by unique match count, then by distance between matches, then by ordered match count.
|
||||
[uniq_score, distance_score, order_score]
|
||||
}
|
||||
|
||||
/// Returns the first and last match where the score computed by match_interval_score is the best.
|
||||
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
|
||||
if matches.is_empty() {
|
||||
panic!("`matches` should not be empty at this point");
|
||||
}
|
||||
|
||||
// positions of the first and the last match of the best matches interval in `matches`.
|
||||
let mut best_interval: Option<MatchIntervalWithScore> = None;
|
||||
|
||||
let mut save_best_interval = |interval_first, interval_last| {
|
||||
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
|
||||
let is_interval_score_better = &best_interval
|
||||
.as_ref()
|
||||
.map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
|
||||
|
||||
if *is_interval_score_better {
|
||||
best_interval = Some(MatchIntervalWithScore {
|
||||
interval: [interval_first, interval_last],
|
||||
score: interval_score,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// we compute the matches interval if we have at least 2 matches.
|
||||
// current interval positions.
|
||||
let mut interval_first = 0;
|
||||
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
|
||||
|
||||
for (index, next_match) in matches.iter().enumerate() {
|
||||
// if next match would make interval gross more than crop_size,
|
||||
// we compare the current interval with the best one,
|
||||
// then we increase `interval_first` until next match can be added.
|
||||
let next_match_last_word_pos = next_match.get_last_word_pos();
|
||||
|
||||
// if the next match would mean that we pass the crop size window,
|
||||
// we take the last valid match, that didn't pass this boundry, which is `index` - 1,
|
||||
// and calculate a score for it, and check if it's better than our best so far
|
||||
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
|
||||
// if index is 0 there is no last viable match
|
||||
if index != 0 {
|
||||
let interval_last = index - 1;
|
||||
// keep interval if it's the best
|
||||
save_best_interval(interval_first, interval_last);
|
||||
}
|
||||
|
||||
// advance start of the interval while interval is longer than crop_size.
|
||||
loop {
|
||||
interval_first += 1;
|
||||
if interval_first == matches.len() {
|
||||
interval_first -= 1;
|
||||
break;
|
||||
}
|
||||
|
||||
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
|
||||
|
||||
if interval_first_match_first_word_pos > next_match_last_word_pos
|
||||
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// compute the last interval score and compare it to the best one.
|
||||
let interval_last = matches.len() - 1;
|
||||
// if it's the last match with itself, we need to make sure it's
|
||||
// not a phrase longer than the crop window
|
||||
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
|
||||
save_best_interval(interval_first, interval_last);
|
||||
}
|
||||
|
||||
// if none of the matches fit the criteria above, default to the first one
|
||||
best_interval.map_or(
|
||||
[&matches[0], &matches[0]],
|
||||
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
|
||||
)
|
||||
}
|
@ -1,62 +0,0 @@
|
||||
use super::matching_words::WordId;
|
||||
|
||||
/// Where a match sits in the text: a single word, or a phrase spanning several words.
#[derive(Debug, Clone)]
pub enum MatchPosition {
    /// The match is a single word.
    Word {
        /// Position of the word in the whole text.
        word_position: usize,
        /// Position of the token in the whole text.
        token_position: usize,
    },
    /// The match is a phrase.
    Phrase {
        /// Positions of the first and last word of the phrase in the whole text.
        word_positions: [usize; 2],
        /// Positions of the first and last token of the phrase in the whole text.
        token_positions: [usize; 2],
    },
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Match {
|
||||
pub char_count: usize,
|
||||
// ids of the query words that matches.
|
||||
pub ids: Vec<WordId>,
|
||||
pub position: MatchPosition,
|
||||
}
|
||||
|
||||
impl Match {
|
||||
pub(super) fn get_first_word_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { word_position, .. } => word_position,
|
||||
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_last_word_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { word_position, .. } => word_position,
|
||||
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_first_token_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { token_position, .. } => token_position,
|
||||
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_last_token_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { token_position, .. } => token_position,
|
||||
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_word_count(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { .. } => 1,
|
||||
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
use charabia::{SeparatorKind, Token, TokenKind};
|
||||
|
||||
pub enum SimpleTokenKind {
|
||||
Separator(SeparatorKind),
|
||||
NotSeparator,
|
||||
}
|
||||
|
||||
impl SimpleTokenKind {
|
||||
pub fn new(token: &&Token<'_>) -> Self {
|
||||
match token.kind {
|
||||
TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
|
||||
_ => Self::NotSeparator,
|
||||
}
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user