Merge branch 'main' into cymruu/payload-unit-test

This commit is contained in:
Filip Bachul 2023-06-21 09:26:50 +02:00
commit 9015a8e8d9
107 changed files with 3889 additions and 2136 deletions

View File

@ -2,4 +2,3 @@ target
Dockerfile Dockerfile
.dockerignore .dockerignore
.gitignore .gitignore
**/.git

View File

@ -1,24 +1,41 @@
#!/bin/bash #!/usr/bin/env bash
set -eu -o pipefail
# check_tag $current_tag $file_tag $file_name check_tag() {
function check_tag { local expected=$1
if [[ "$1" != "$2" ]]; then local actual=$2
echo "Error: the current tag does not match the version in Cargo.toml: found $2 - expected $1" local filename=$3
ret=1
fi if [[ $actual != $expected ]]; then
echo >&2 "Error: the current tag does not match the version in $filename: found $actual, expected $expected"
return 1
fi
} }
read_version() {
grep '^version = ' | cut -d \" -f 2
}
if [[ -z "${GITHUB_REF:-}" ]]; then
echo >&2 "Error: GITHUB_REF is not set"
exit 1
fi
if [[ ! "$GITHUB_REF" =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+(-[a-z0-9]+)?$ ]]; then
echo >&2 "Error: GITHUB_REF is not a valid tag: $GITHUB_REF"
exit 1
fi
current_tag=${GITHUB_REF#refs/tags/v}
ret=0 ret=0
current_tag=${GITHUB_REF#'refs/tags/v'}
file_tag="$(grep '^version = ' Cargo.toml | cut -d '=' -f 2 | tr -d '"' | tr -d ' ')" toml_tag="$(cat Cargo.toml | read_version)"
check_tag $current_tag $file_tag check_tag "$current_tag" "$toml_tag" Cargo.toml || ret=1
lock_file='Cargo.lock' lock_tag=$(grep -A 1 '^name = "meilisearch-auth"' Cargo.lock | read_version)
lock_tag=$(grep -A 1 'name = "meilisearch-auth"' $lock_file | grep version | cut -d '=' -f 2 | tr -d '"' | tr -d ' ') check_tag "$current_tag" "$lock_tag" Cargo.lock || ret=1
check_tag $current_tag $lock_tag $lock_file
if [[ "$ret" -eq 0 ]] ; then if (( ret == 0 )); then
echo 'OK' echo 'OK'
fi fi
exit $ret exit $ret

24
.github/workflows/fuzzer-indexing.yml vendored Normal file
View File

@ -0,0 +1,24 @@
name: Run the indexing fuzzer
on:
push:
branches:
- main
jobs:
fuzz:
name: Setup the action
runs-on: ubuntu-latest
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
# Run benchmarks
- name: Run the fuzzer
run: |
cargo run --release --bin fuzz-indexing

View File

@ -35,7 +35,7 @@ jobs:
- name: Build deb package - name: Build deb package
run: cargo deb -p meilisearch -o target/debian/meilisearch.deb run: cargo deb -p meilisearch -o target/debian/meilisearch.deb
- name: Upload debian pkg to release - name: Upload debian pkg to release
uses: svenstaro/upload-release-action@2.5.0 uses: svenstaro/upload-release-action@2.6.1
with: with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/debian/meilisearch.deb file: target/debian/meilisearch.deb

View File

@ -54,7 +54,7 @@ jobs:
# No need to upload binaries for dry run (cron) # No need to upload binaries for dry run (cron)
- name: Upload binaries to release - name: Upload binaries to release
if: github.event_name == 'release' if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.5.0 uses: svenstaro/upload-release-action@2.6.1
with: with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/release/meilisearch file: target/release/meilisearch
@ -87,7 +87,7 @@ jobs:
# No need to upload binaries for dry run (cron) # No need to upload binaries for dry run (cron)
- name: Upload binaries to release - name: Upload binaries to release
if: github.event_name == 'release' if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.5.0 uses: svenstaro/upload-release-action@2.6.1
with: with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/release/${{ matrix.artifact_name }} file: target/release/${{ matrix.artifact_name }}
@ -121,7 +121,7 @@ jobs:
- name: Upload the binary to release - name: Upload the binary to release
# No need to upload binaries for dry run (cron) # No need to upload binaries for dry run (cron)
if: github.event_name == 'release' if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.5.0 uses: svenstaro/upload-release-action@2.6.1
with: with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/${{ matrix.target }}/release/meilisearch file: target/${{ matrix.target }}/release/meilisearch
@ -183,7 +183,7 @@ jobs:
- name: Upload the binary to release - name: Upload the binary to release
# No need to upload binaries for dry run (cron) # No need to upload binaries for dry run (cron)
if: github.event_name == 'release' if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.5.0 uses: svenstaro/upload-release-action@2.6.1
with: with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/${{ matrix.target }}/release/meilisearch file: target/${{ matrix.target }}/release/meilisearch

View File

@ -58,13 +58,9 @@ jobs:
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@v2 uses: docker/setup-qemu-action@v2
with:
platforms: linux/amd64,linux/arm64
- name: Set up Docker Buildx - name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2 uses: docker/setup-buildx-action@v2
with:
platforms: linux/amd64,linux/arm64
- name: Login to Docker Hub - name: Login to Docker Hub
uses: docker/login-action@v2 uses: docker/login-action@v2
@ -92,13 +88,10 @@ jobs:
push: true push: true
platforms: linux/amd64,linux/arm64 platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta.outputs.tags }} tags: ${{ steps.meta.outputs.tags }}
builder: ${{ steps.buildx.outputs.name }}
build-args: | build-args: |
COMMIT_SHA=${{ github.sha }} COMMIT_SHA=${{ github.sha }}
COMMIT_DATE=${{ steps.build-metadata.outputs.date }} COMMIT_DATE=${{ steps.build-metadata.outputs.date }}
GIT_TAG=${{ github.ref_name }} GIT_TAG=${{ github.ref_name }}
cache-from: type=gha
cache-to: type=gha,mode=max
# /!\ Don't touch this without checking with Cloud team # /!\ Don't touch this without checking with Cloud team
- name: Send CI information to Cloud team - name: Send CI information to Cloud team

View File

@ -3,6 +3,11 @@ name: SDKs tests
on: on:
workflow_dispatch: workflow_dispatch:
inputs:
docker_image:
description: 'The Meilisearch Docker image used'
required: false
default: nightly
schedule: schedule:
- cron: "0 6 * * MON" # Every Monday at 6:00AM - cron: "0 6 * * MON" # Every Monday at 6:00AM
@ -11,13 +16,28 @@ env:
MEILI_NO_ANALYTICS: 'true' MEILI_NO_ANALYTICS: 'true'
jobs: jobs:
define-docker-image:
runs-on: ubuntu-latest
outputs:
docker-image: ${{ steps.define-image.outputs.docker-image }}
steps:
- uses: actions/checkout@v3
- name: Define the Docker image we need to use
id: define-image
run: |
event=${{ github.event_name }}
echo "docker-image=nightly" >> $GITHUB_OUTPUT
if [[ $event == 'workflow_dispatch' ]]; then
echo "docker-image=${{ github.event.inputs.docker_image }}" >> $GITHUB_OUTPUT
fi
meilisearch-js-tests: meilisearch-js-tests:
needs: define-docker-image
name: JS SDK tests name: JS SDK tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -47,11 +67,12 @@ jobs:
run: yarn test:env:browser run: yarn test:env:browser
instant-meilisearch-tests: instant-meilisearch-tests:
needs: define-docker-image
name: instant-meilisearch tests name: instant-meilisearch tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -73,11 +94,12 @@ jobs:
run: yarn build run: yarn build
meilisearch-php-tests: meilisearch-php-tests:
needs: define-docker-image
name: PHP SDK tests name: PHP SDK tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -103,11 +125,12 @@ jobs:
composer remove --dev guzzlehttp/guzzle http-interop/http-factory-guzzle composer remove --dev guzzlehttp/guzzle http-interop/http-factory-guzzle
meilisearch-python-tests: meilisearch-python-tests:
needs: define-docker-image
name: Python SDK tests name: Python SDK tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -127,11 +150,12 @@ jobs:
run: pipenv run pytest run: pipenv run pytest
meilisearch-go-tests: meilisearch-go-tests:
needs: define-docker-image
name: Go SDK tests name: Go SDK tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -139,7 +163,7 @@ jobs:
- '7700:7700' - '7700:7700'
steps: steps:
- name: Set up Go - name: Set up Go
uses: actions/setup-go@v3 uses: actions/setup-go@v4
with: with:
go-version: stable go-version: stable
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@ -156,11 +180,12 @@ jobs:
run: go test -v ./... run: go test -v ./...
meilisearch-ruby-tests: meilisearch-ruby-tests:
needs: define-docker-image
name: Ruby SDK tests name: Ruby SDK tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -180,11 +205,12 @@ jobs:
run: bundle exec rspec run: bundle exec rspec
meilisearch-rust-tests: meilisearch-rust-tests:
needs: define-docker-image
name: Rust SDK tests name: Rust SDK tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
services: services:
meilisearch: meilisearch:
image: getmeili/meilisearch:nightly image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env: env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }} MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }} MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}

View File

@ -43,7 +43,7 @@ jobs:
toolchain: nightly toolchain: nightly
override: true override: true
- name: Cache dependencies - name: Cache dependencies
uses: Swatinem/rust-cache@v2.2.1 uses: Swatinem/rust-cache@v2.4.0
- name: Run cargo check without any default features - name: Run cargo check without any default features
uses: actions-rs/cargo@v1 uses: actions-rs/cargo@v1
with: with:
@ -65,7 +65,7 @@ jobs:
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- name: Cache dependencies - name: Cache dependencies
uses: Swatinem/rust-cache@v2.2.1 uses: Swatinem/rust-cache@v2.4.0
- name: Run cargo check without any default features - name: Run cargo check without any default features
uses: actions-rs/cargo@v1 uses: actions-rs/cargo@v1
with: with:
@ -105,6 +105,29 @@ jobs:
command: test command: test
args: --workspace --locked --release --all-features args: --workspace --locked --release --all-features
test-disabled-tokenization:
name: Test disabled tokenization
runs-on: ubuntu-latest
container:
image: ubuntu:18.04
if: github.event_name == 'schedule'
steps:
- uses: actions/checkout@v3
- name: Install needed dependencies
run: |
apt-get update
apt-get install --assume-yes build-essential curl
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- name: Run cargo tree without default features and check lindera is not present
run: |
cargo tree -f '{p} {f}' -e normal --no-default-features | grep lindera -vqz
- name: Run cargo tree with default features and check lindera is pressent
run: |
cargo tree -f '{p} {f}' -e normal | grep lindera -qz
# We run tests in debug also, to make sure that the debug_assertions are hit # We run tests in debug also, to make sure that the debug_assertions are hit
test-debug: test-debug:
name: Run tests in debug name: Run tests in debug
@ -123,7 +146,7 @@ jobs:
toolchain: stable toolchain: stable
override: true override: true
- name: Cache dependencies - name: Cache dependencies
uses: Swatinem/rust-cache@v2.2.1 uses: Swatinem/rust-cache@v2.4.0
- name: Run tests in debug - name: Run tests in debug
uses: actions-rs/cargo@v1 uses: actions-rs/cargo@v1
with: with:
@ -142,7 +165,7 @@ jobs:
override: true override: true
components: clippy components: clippy
- name: Cache dependencies - name: Cache dependencies
uses: Swatinem/rust-cache@v2.2.1 uses: Swatinem/rust-cache@v2.4.0
- name: Run cargo clippy - name: Run cargo clippy
uses: actions-rs/cargo@v1 uses: actions-rs/cargo@v1
with: with:
@ -161,7 +184,7 @@ jobs:
override: true override: true
components: rustfmt components: rustfmt
- name: Cache dependencies - name: Cache dependencies
uses: Swatinem/rust-cache@v2.2.1 uses: Swatinem/rust-cache@v2.4.0
- name: Run cargo fmt - name: Run cargo fmt
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate

732
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -13,7 +13,8 @@ members = [
"filter-parser", "filter-parser",
"flatten-serde-json", "flatten-serde-json",
"json-depth-checker", "json-depth-checker",
"benchmarks" "benchmarks",
"fuzzers",
] ]
[workspace.package] [workspace.package]

View File

@ -1,4 +1,3 @@
# syntax=docker/dockerfile:1.4
# Compile # Compile
FROM rust:alpine3.16 AS compiler FROM rust:alpine3.16 AS compiler
@ -12,7 +11,7 @@ ARG GIT_TAG
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG} ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG}
ENV RUSTFLAGS="-C target-feature=-crt-static" ENV RUSTFLAGS="-C target-feature=-crt-static"
COPY --link . . COPY . .
RUN set -eux; \ RUN set -eux; \
apkArch="$(apk --print-arch)"; \ apkArch="$(apk --print-arch)"; \
if [ "$apkArch" = "aarch64" ]; then \ if [ "$apkArch" = "aarch64" ]; then \
@ -31,7 +30,7 @@ RUN apk update --quiet \
# add meilisearch to the `/bin` so you can run it from anywhere and it's easy # add meilisearch to the `/bin` so you can run it from anywhere and it's easy
# to find. # to find.
COPY --from=compiler --link /meilisearch/target/release/meilisearch /bin/meilisearch COPY --from=compiler /meilisearch/target/release/meilisearch /bin/meilisearch
# To stay compatible with the older version of the container (pre v0.27.0) we're # To stay compatible with the older version of the container (pre v0.27.0) we're
# going to symlink the meilisearch binary in the path to `/meilisearch` # going to symlink the meilisearch binary in the path to `/meilisearch`
RUN ln -s /bin/meilisearch /meilisearch RUN ln -s /bin/meilisearch /meilisearch

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
global:
scrape_interval: 15s # By default, scrape targets every 15 seconds.
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'codelab-monitor'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'meilisearch'
# Override the global default and scrape targets from this job every 5 seconds.
scrape_interval: 5s
static_configs:
- targets: ['localhost:7700']

View File

@ -1,128 +1,131 @@
# This file shows the default configuration of Meilisearch. # This file shows the default configuration of Meilisearch.
# All variables are defined here: https://www.meilisearch.com/docs/learn/configuration/instance_options#environment-variables # All variables are defined here: https://www.meilisearch.com/docs/learn/configuration/instance_options#environment-variables
db_path = "./data.ms"
# Designates the location where database files will be created and retrieved. # Designates the location where database files will be created and retrieved.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#database-path # https://www.meilisearch.com/docs/learn/configuration/instance_options#database-path
db_path = "./data.ms"
env = "development"
# Configures the instance's environment. Value must be either `production` or `development`. # Configures the instance's environment. Value must be either `production` or `development`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#environment # https://www.meilisearch.com/docs/learn/configuration/instance_options#environment
env = "development"
http_addr = "localhost:7700"
# The address on which the HTTP server will listen. # The address on which the HTTP server will listen.
http_addr = "localhost:7700"
# master_key = "YOUR_MASTER_KEY_VALUE"
# Sets the instance's master key, automatically protecting all routes except GET /health. # Sets the instance's master key, automatically protecting all routes except GET /health.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#master-key # https://www.meilisearch.com/docs/learn/configuration/instance_options#master-key
# master_key = "YOUR_MASTER_KEY_VALUE"
# no_analytics = true
# Deactivates Meilisearch's built-in telemetry when provided. # Deactivates Meilisearch's built-in telemetry when provided.
# Meilisearch automatically collects data from all instances that do not opt out using this flag. # Meilisearch automatically collects data from all instances that do not opt out using this flag.
# All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted at any time. # All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted at any time.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#disable-analytics # https://www.meilisearch.com/docs/learn/configuration/instance_options#disable-analytics
# no_analytics = true
http_payload_size_limit = "100 MB"
# Sets the maximum size of accepted payloads. # Sets the maximum size of accepted payloads.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#payload-limit-size # https://www.meilisearch.com/docs/learn/configuration/instance_options#payload-limit-size
http_payload_size_limit = "100 MB"
log_level = "INFO"
# Defines how much detail should be present in Meilisearch's logs. # Defines how much detail should be present in Meilisearch's logs.
# Meilisearch currently supports six log levels, listed in order of increasing verbosity: `OFF`, `ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE` # Meilisearch currently supports six log levels, listed in order of increasing verbosity: `OFF`, `ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`
# https://www.meilisearch.com/docs/learn/configuration/instance_options#log-level # https://www.meilisearch.com/docs/learn/configuration/instance_options#log-level
log_level = "INFO"
# max_indexing_memory = "2 GiB"
# Sets the maximum amount of RAM Meilisearch can use when indexing. # Sets the maximum amount of RAM Meilisearch can use when indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-memory # https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-memory
# max_indexing_memory = "2 GiB"
# max_indexing_threads = 4
# Sets the maximum number of threads Meilisearch can use during indexing. # Sets the maximum number of threads Meilisearch can use during indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-threads # https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-threads
# max_indexing_threads = 4
############# #############
### DUMPS ### ### DUMPS ###
############# #############
dump_dir = "dumps/"
# Sets the directory where Meilisearch will create dump files. # Sets the directory where Meilisearch will create dump files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#dump-directory # https://www.meilisearch.com/docs/learn/configuration/instance_options#dump-directory
dump_dir = "dumps/"
# import_dump = "./path/to/my/file.dump"
# Imports the dump file located at the specified path. Path must point to a .dump file. # Imports the dump file located at the specified path. Path must point to a .dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-dump # https://www.meilisearch.com/docs/learn/configuration/instance_options#import-dump
# import_dump = "./path/to/my/file.dump"
ignore_missing_dump = false
# Prevents Meilisearch from throwing an error when `import_dump` does not point to a valid dump file. # Prevents Meilisearch from throwing an error when `import_dump` does not point to a valid dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-dump # https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-dump
ignore_missing_dump = false
ignore_dump_if_db_exists = false
# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_dump`. # Prevents a Meilisearch instance with an existing database from throwing an error when using `import_dump`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-dump-if-db-exists # https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-dump-if-db-exists
ignore_dump_if_db_exists = false
################# #################
### SNAPSHOTS ### ### SNAPSHOTS ###
################# #################
schedule_snapshot = false
# Enables scheduled snapshots when true, disable when false (the default). # Enables scheduled snapshots when true, disable when false (the default).
# If the value is given as an integer, then enables the scheduled snapshot with the passed value as the interval # If the value is given as an integer, then enables the scheduled snapshot with the passed value as the interval
# between each snapshot, in seconds. # between each snapshot, in seconds.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#schedule-snapshot-creation # https://www.meilisearch.com/docs/learn/configuration/instance_options#schedule-snapshot-creation
schedule_snapshot = false
snapshot_dir = "snapshots/"
# Sets the directory where Meilisearch will store snapshots. # Sets the directory where Meilisearch will store snapshots.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#snapshot-destination # https://www.meilisearch.com/docs/learn/configuration/instance_options#snapshot-destination
snapshot_dir = "snapshots/"
# import_snapshot = "./path/to/my/snapshot"
# Launches Meilisearch after importing a previously-generated snapshot at the given filepath. # Launches Meilisearch after importing a previously-generated snapshot at the given filepath.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-snapshot # https://www.meilisearch.com/docs/learn/configuration/instance_options#import-snapshot
# import_snapshot = "./path/to/my/snapshot"
ignore_missing_snapshot = false
# Prevents a Meilisearch instance from throwing an error when `import_snapshot` does not point to a valid snapshot file. # Prevents a Meilisearch instance from throwing an error when `import_snapshot` does not point to a valid snapshot file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-snapshot # https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-snapshot
ignore_missing_snapshot = false
ignore_snapshot_if_db_exists = false
# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_snapshot`. # Prevents a Meilisearch instance with an existing database from throwing an error when using `import_snapshot`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-snapshot-if-db-exists # https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-snapshot-if-db-exists
ignore_snapshot_if_db_exists = false
########### ###########
### SSL ### ### SSL ###
########### ###########
# ssl_auth_path = "./path/to/root"
# Enables client authentication in the specified path. # Enables client authentication in the specified path.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-authentication-path # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-authentication-path
# ssl_auth_path = "./path/to/root"
# ssl_cert_path = "./path/to/certfile"
# Sets the server's SSL certificates. # Sets the server's SSL certificates.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-certificates-path # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-certificates-path
# ssl_cert_path = "./path/to/certfile"
# ssl_key_path = "./path/to/private-key"
# Sets the server's SSL key files. # Sets the server's SSL key files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-key-path # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-key-path
# ssl_key_path = "./path/to/private-key"
# ssl_ocsp_path = "./path/to/ocsp-file"
# Sets the server's OCSP file. # Sets the server's OCSP file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-ocsp-path # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-ocsp-path
# ssl_ocsp_path = "./path/to/ocsp-file"
ssl_require_auth = false
# Makes SSL authentication mandatory. # Makes SSL authentication mandatory.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-require-auth # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-require-auth
ssl_require_auth = false
ssl_resumption = false
# Activates SSL session resumption. # Activates SSL session resumption.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-resumption # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-resumption
ssl_resumption = false
ssl_tickets = false
# Activates SSL tickets. # Activates SSL tickets.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-tickets # https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-tickets
ssl_tickets = false
############################# #############################
### Experimental features ### ### Experimental features ###
############################# #############################
experimental_enable_metrics = false
# Experimental metrics feature. For more information, see: <https://github.com/meilisearch/meilisearch/discussions/3518> # Experimental metrics feature. For more information, see: <https://github.com/meilisearch/meilisearch/discussions/3518>
# Enables the Prometheus metrics on the `GET /metrics` endpoint. # Enables the Prometheus metrics on the `GET /metrics` endpoint.
experimental_enable_metrics = false
# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
experimental_reduce_indexing_memory_usage = false

20
fuzzers/Cargo.toml Normal file
View File

@ -0,0 +1,20 @@
[package]
name = "fuzzers"
publish = false
version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
arbitrary = { version = "1.3.0", features = ["derive"] }
clap = { version = "4.3.0", features = ["derive"] }
fastrand = "1.9.0"
milli = { path = "../milli" }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
tempfile = "3.5.0"

3
fuzzers/README.md Normal file
View File

@ -0,0 +1,3 @@
# Fuzzers
The purpose of this crate is to contains all the handmade "fuzzer" we may need.

View File

@ -0,0 +1,152 @@
use std::num::NonZeroUsize;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::time::Duration;
use arbitrary::{Arbitrary, Unstructured};
use clap::Parser;
use fuzzers::Operation;
use milli::heed::EnvOpenOptions;
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
use milli::Index;
use tempfile::TempDir;
#[derive(Debug, Arbitrary)]
struct Batch([Operation; 5]);
#[derive(Debug, Clone, Parser)]
struct Opt {
/// The number of fuzzer to run in parallel.
#[clap(long)]
par: Option<NonZeroUsize>,
// We need to put a lot of newlines in the following documentation or else everything gets collapsed on one line
/// The path in which the databases will be created.
/// Using a ramdisk is recommended.
///
/// Linux:
///
/// sudo mount -t tmpfs -o size=2g tmpfs ramdisk # to create it
///
/// sudo umount ramdisk # to remove it
///
/// MacOS:
///
/// diskutil erasevolume HFS+ 'RAM Disk' `hdiutil attach -nobrowse -nomount ram://4194304 # create it
///
/// hdiutil detach /dev/:the_disk
#[clap(long)]
path: Option<PathBuf>,
}
fn main() {
let opt = Opt::parse();
let progression: &'static AtomicUsize = Box::leak(Box::new(AtomicUsize::new(0)));
let stop: &'static AtomicBool = Box::leak(Box::new(AtomicBool::new(false)));
let par = opt.par.unwrap_or_else(|| std::thread::available_parallelism().unwrap()).get();
let mut handles = Vec::with_capacity(par);
for _ in 0..par {
let opt = opt.clone();
let handle = std::thread::spawn(move || {
let mut options = EnvOpenOptions::new();
options.map_size(1024 * 1024 * 1024 * 1024);
let tempdir = match opt.path {
Some(path) => TempDir::new_in(path).unwrap(),
None => TempDir::new().unwrap(),
};
let index = Index::new(options, tempdir.path()).unwrap();
let indexer_config = IndexerConfig::default();
let index_documents_config = IndexDocumentsConfig::default();
std::thread::scope(|s| {
loop {
if stop.load(Ordering::Relaxed) {
return;
}
let v: Vec<u8> =
std::iter::repeat_with(|| fastrand::u8(..)).take(1000).collect();
let mut data = Unstructured::new(&v);
let batches = <[Batch; 5]>::arbitrary(&mut data).unwrap();
// will be used to display the error once a thread crashes
let dbg_input = format!("{:#?}", batches);
let handle = s.spawn(|| {
let mut wtxn = index.write_txn().unwrap();
for batch in batches {
let mut builder = IndexDocuments::new(
&mut wtxn,
&index,
&indexer_config,
index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
for op in batch.0 {
match op {
Operation::AddDoc(doc) => {
let documents =
milli::documents::objects_from_json_value(doc.to_d());
let documents =
milli::documents::documents_batch_reader_from_objects(
documents,
);
let (b, _added) = builder.add_documents(documents).unwrap();
builder = b;
}
Operation::DeleteDoc(id) => {
let (b, _removed) =
builder.remove_documents(vec![id.to_s()]).unwrap();
builder = b;
}
}
}
builder.execute().unwrap();
// after executing a batch we check if the database is corrupted
let res = index.search(&wtxn).execute().unwrap();
index.documents(&wtxn, res.documents_ids).unwrap();
progression.fetch_add(1, Ordering::Relaxed);
}
wtxn.abort().unwrap();
});
if let err @ Err(_) = handle.join() {
stop.store(true, Ordering::Relaxed);
err.expect(&dbg_input);
}
}
});
});
handles.push(handle);
}
std::thread::spawn(|| {
let mut last_value = 0;
let start = std::time::Instant::now();
loop {
let total = progression.load(Ordering::Relaxed);
let elapsed = start.elapsed().as_secs();
if elapsed > 3600 {
// after 1 hour, stop the fuzzer, success
std::process::exit(0);
}
println!(
"Has been running for {:?} seconds. Tested {} new values for a total of {}.",
elapsed,
total - last_value,
total
);
last_value = total;
std::thread::sleep(Duration::from_secs(1));
}
});
for handle in handles {
handle.join().unwrap();
}
}

46
fuzzers/src/lib.rs Normal file
View File

@ -0,0 +1,46 @@
use arbitrary::Arbitrary;
use serde_json::{json, Value};
#[derive(Debug, Arbitrary)]
pub enum Document {
One,
Two,
Three,
Four,
Five,
Six,
}
impl Document {
pub fn to_d(&self) -> Value {
match self {
Document::One => json!({ "id": 0, "doggo": "bernese" }),
Document::Two => json!({ "id": 0, "doggo": "golden" }),
Document::Three => json!({ "id": 0, "catto": "jorts" }),
Document::Four => json!({ "id": 1, "doggo": "bernese" }),
Document::Five => json!({ "id": 1, "doggo": "golden" }),
Document::Six => json!({ "id": 1, "catto": "jorts" }),
}
}
}
#[derive(Debug, Arbitrary)]
pub enum DocId {
Zero,
One,
}
impl DocId {
pub fn to_s(&self) -> String {
match self {
DocId::Zero => "0".to_string(),
DocId::One => "1".to_string(),
}
}
}
#[derive(Debug, Arbitrary)]
pub enum Operation {
AddDoc(Document),
DeleteDoc(DocId),
}

File diff suppressed because it is too large Load Diff

View File

@ -321,9 +321,18 @@ impl BatchKind {
}) })
} }
( (
this @ BatchKind::DocumentOperation { .. }, BatchKind::DocumentOperation { method, allow_index_creation, primary_key, mut operation_ids },
K::DocumentDeletion, K::DocumentDeletion,
) => Break(this), ) => {
operation_ids.push(id);
Continue(BatchKind::DocumentOperation {
method,
allow_index_creation,
primary_key,
operation_ids,
})
}
// but we can't autobatch documents if it's not the same kind // but we can't autobatch documents if it's not the same kind
// this match branch MUST be AFTER the previous one // this match branch MUST be AFTER the previous one
( (
@ -346,7 +355,35 @@ impl BatchKind {
deletion_ids.push(id); deletion_ids.push(id);
Continue(BatchKind::DocumentClear { ids: deletion_ids }) Continue(BatchKind::DocumentClear { ids: deletion_ids })
} }
// we can't autobatch a deletion and an import // we can autobatch the deletion and import if the index already exists
(
BatchKind::DocumentDeletion { mut deletion_ids },
K::DocumentImport { method, allow_index_creation, primary_key }
) if index_already_exists => {
deletion_ids.push(id);
Continue(BatchKind::DocumentOperation {
method,
allow_index_creation,
primary_key,
operation_ids: deletion_ids,
})
}
// we can autobatch the deletion and import if both can't create an index
(
BatchKind::DocumentDeletion { mut deletion_ids },
K::DocumentImport { method, allow_index_creation, primary_key }
) if !allow_index_creation => {
deletion_ids.push(id);
Continue(BatchKind::DocumentOperation {
method,
allow_index_creation,
primary_key,
operation_ids: deletion_ids,
})
}
// we can't autobatch a deletion and an import if the index does not exists but would be created by an addition
( (
this @ BatchKind::DocumentDeletion { .. }, this @ BatchKind::DocumentDeletion { .. },
K::DocumentImport { .. } K::DocumentImport { .. }
@ -648,36 +685,36 @@ mod tests {
debug_snapshot!(autobatch_from(false,None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false,None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false,None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))"); debug_snapshot!(autobatch_from(false,None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))");
// We can't autobatch document addition with document deletion // We can autobatch document addition with document deletion
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
// we also can't do the only way around // And the other way around
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
} }
#[test] #[test]

View File

@ -24,6 +24,7 @@ use std::io::BufWriter;
use dump::IndexMetadata; use dump::IndexMetadata;
use log::{debug, error, info}; use log::{debug, error, info};
use meilisearch_types::error::Code;
use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::heed::{RoTxn, RwTxn};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::heed::CompactionOption;
@ -1491,7 +1492,12 @@ fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result
Ok(if let Some(filter) = filter { Ok(if let Some(filter) = filter {
let mut wtxn = index.write_txn()?; let mut wtxn = index.write_txn()?;
let candidates = filter.evaluate(&wtxn, &index)?; let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
}
e => e.into(),
})?;
let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?; let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
delete_operation.delete_documents(&candidates); delete_operation.delete_documents(&candidates);
let deleted_documents = let deleted_documents =

View File

@ -46,6 +46,8 @@ impl From<DateField> for Code {
#[allow(clippy::large_enum_variant)] #[allow(clippy::large_enum_variant)]
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum Error { pub enum Error {
#[error("{1}")]
WithCustomErrorCode(Code, Box<Self>),
#[error("Index `{0}` not found.")] #[error("Index `{0}` not found.")]
IndexNotFound(String), IndexNotFound(String),
#[error("Index `{0}` already exists.")] #[error("Index `{0}` already exists.")]
@ -144,6 +146,7 @@ impl Error {
pub fn is_recoverable(&self) -> bool { pub fn is_recoverable(&self) -> bool {
match self { match self {
Error::IndexNotFound(_) Error::IndexNotFound(_)
| Error::WithCustomErrorCode(_, _)
| Error::IndexAlreadyExists(_) | Error::IndexAlreadyExists(_)
| Error::SwapDuplicateIndexFound(_) | Error::SwapDuplicateIndexFound(_)
| Error::SwapDuplicateIndexesFound(_) | Error::SwapDuplicateIndexesFound(_)
@ -176,11 +179,16 @@ impl Error {
Error::PlannedFailure => false, Error::PlannedFailure => false,
} }
} }
pub fn with_custom_error_code(self, code: Code) -> Self {
Self::WithCustomErrorCode(code, Box::new(self))
}
} }
impl ErrorCode for Error { impl ErrorCode for Error {
fn error_code(&self) -> Code { fn error_code(&self) -> Code {
match self { match self {
Error::WithCustomErrorCode(code, _) => *code,
Error::IndexNotFound(_) => Code::IndexNotFound, Error::IndexNotFound(_) => Code::IndexNotFound,
Error::IndexAlreadyExists(_) => Code::IndexAlreadyExists, Error::IndexAlreadyExists(_) => Code::IndexAlreadyExists,
Error::SwapDuplicateIndexesFound(_) => Code::InvalidSwapDuplicateIndexFound, Error::SwapDuplicateIndexesFound(_) => Code::InvalidSwapDuplicateIndexFound,

View File

@ -5,6 +5,7 @@ use std::collections::BTreeMap;
use std::path::Path; use std::path::Path;
use std::time::Duration; use std::time::Duration;
use meilisearch_types::heed::flags::Flags;
use meilisearch_types::heed::{EnvClosingEvent, EnvOpenOptions}; use meilisearch_types::heed::{EnvClosingEvent, EnvOpenOptions};
use meilisearch_types::milli::Index; use meilisearch_types::milli::Index;
use time::OffsetDateTime; use time::OffsetDateTime;
@ -53,6 +54,7 @@ pub struct IndexMap {
pub struct ClosingIndex { pub struct ClosingIndex {
uuid: Uuid, uuid: Uuid,
closing_event: EnvClosingEvent, closing_event: EnvClosingEvent,
enable_mdb_writemap: bool,
map_size: usize, map_size: usize,
generation: usize, generation: usize,
} }
@ -68,6 +70,7 @@ impl ClosingIndex {
pub fn wait_timeout(self, timeout: Duration) -> Option<ReopenableIndex> { pub fn wait_timeout(self, timeout: Duration) -> Option<ReopenableIndex> {
self.closing_event.wait_timeout(timeout).then_some(ReopenableIndex { self.closing_event.wait_timeout(timeout).then_some(ReopenableIndex {
uuid: self.uuid, uuid: self.uuid,
enable_mdb_writemap: self.enable_mdb_writemap,
map_size: self.map_size, map_size: self.map_size,
generation: self.generation, generation: self.generation,
}) })
@ -76,6 +79,7 @@ impl ClosingIndex {
pub struct ReopenableIndex { pub struct ReopenableIndex {
uuid: Uuid, uuid: Uuid,
enable_mdb_writemap: bool,
map_size: usize, map_size: usize,
generation: usize, generation: usize,
} }
@ -103,7 +107,7 @@ impl ReopenableIndex {
return Ok(()); return Ok(());
} }
map.unavailable.remove(&self.uuid); map.unavailable.remove(&self.uuid);
map.create(&self.uuid, path, None, self.map_size)?; map.create(&self.uuid, path, None, self.enable_mdb_writemap, self.map_size)?;
} }
Ok(()) Ok(())
} }
@ -170,16 +174,17 @@ impl IndexMap {
uuid: &Uuid, uuid: &Uuid,
path: &Path, path: &Path,
date: Option<(OffsetDateTime, OffsetDateTime)>, date: Option<(OffsetDateTime, OffsetDateTime)>,
enable_mdb_writemap: bool,
map_size: usize, map_size: usize,
) -> Result<Index> { ) -> Result<Index> {
if !matches!(self.get_unavailable(uuid), Missing) { if !matches!(self.get_unavailable(uuid), Missing) {
panic!("Attempt to open an index that was unavailable"); panic!("Attempt to open an index that was unavailable");
} }
let index = create_or_open_index(path, date, map_size)?; let index = create_or_open_index(path, date, enable_mdb_writemap, map_size)?;
match self.available.insert(*uuid, index.clone()) { match self.available.insert(*uuid, index.clone()) {
InsertionOutcome::InsertedNew => (), InsertionOutcome::InsertedNew => (),
InsertionOutcome::Evicted(evicted_uuid, evicted_index) => { InsertionOutcome::Evicted(evicted_uuid, evicted_index) => {
self.close(evicted_uuid, evicted_index, 0); self.close(evicted_uuid, evicted_index, enable_mdb_writemap, 0);
} }
InsertionOutcome::Replaced(_) => { InsertionOutcome::Replaced(_) => {
panic!("Attempt to open an index that was already opened") panic!("Attempt to open an index that was already opened")
@ -212,17 +217,30 @@ impl IndexMap {
/// | Closing | Closing | /// | Closing | Closing |
/// | Available | Closing | /// | Available | Closing |
/// ///
pub fn close_for_resize(&mut self, uuid: &Uuid, map_size_growth: usize) { pub fn close_for_resize(
&mut self,
uuid: &Uuid,
enable_mdb_writemap: bool,
map_size_growth: usize,
) {
let Some(index) = self.available.remove(uuid) else { return; }; let Some(index) = self.available.remove(uuid) else { return; };
self.close(*uuid, index, map_size_growth); self.close(*uuid, index, enable_mdb_writemap, map_size_growth);
} }
fn close(&mut self, uuid: Uuid, index: Index, map_size_growth: usize) { fn close(
&mut self,
uuid: Uuid,
index: Index,
enable_mdb_writemap: bool,
map_size_growth: usize,
) {
let map_size = index.map_size().unwrap_or(DEFAULT_MAP_SIZE) + map_size_growth; let map_size = index.map_size().unwrap_or(DEFAULT_MAP_SIZE) + map_size_growth;
let closing_event = index.prepare_for_closing(); let closing_event = index.prepare_for_closing();
let generation = self.next_generation(); let generation = self.next_generation();
self.unavailable self.unavailable.insert(
.insert(uuid, Some(ClosingIndex { uuid, closing_event, map_size, generation })); uuid,
Some(ClosingIndex { uuid, closing_event, enable_mdb_writemap, map_size, generation }),
);
} }
/// Attempts to delete and index. /// Attempts to delete and index.
@ -282,11 +300,15 @@ impl IndexMap {
fn create_or_open_index( fn create_or_open_index(
path: &Path, path: &Path,
date: Option<(OffsetDateTime, OffsetDateTime)>, date: Option<(OffsetDateTime, OffsetDateTime)>,
enable_mdb_writemap: bool,
map_size: usize, map_size: usize,
) -> Result<Index> { ) -> Result<Index> {
let mut options = EnvOpenOptions::new(); let mut options = EnvOpenOptions::new();
options.map_size(clamp_to_page_size(map_size)); options.map_size(clamp_to_page_size(map_size));
options.max_readers(1024); options.max_readers(1024);
if enable_mdb_writemap {
unsafe { options.flag(Flags::MdbWriteMap) };
}
if let Some((created, updated)) = date { if let Some((created, updated)) = date {
Ok(Index::new_with_creation_dates(options, path, created, updated)?) Ok(Index::new_with_creation_dates(options, path, created, updated)?)

View File

@ -66,6 +66,8 @@ pub struct IndexMapper {
index_base_map_size: usize, index_base_map_size: usize,
/// The quantity by which the map size of an index is incremented upon reopening, in bytes. /// The quantity by which the map size of an index is incremented upon reopening, in bytes.
index_growth_amount: usize, index_growth_amount: usize,
/// Whether we open a meilisearch index with the MDB_WRITEMAP option or not.
enable_mdb_writemap: bool,
pub indexer_config: Arc<IndexerConfig>, pub indexer_config: Arc<IndexerConfig>,
} }
@ -88,8 +90,17 @@ pub enum IndexStatus {
pub struct IndexStats { pub struct IndexStats {
/// Number of documents in the index. /// Number of documents in the index.
pub number_of_documents: u64, pub number_of_documents: u64,
/// Size of the index' DB, in bytes. /// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
pub database_size: u64, pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents. /// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution, pub field_distribution: FieldDistribution,
/// Creation date of the index. /// Creation date of the index.
@ -105,10 +116,10 @@ impl IndexStats {
/// ///
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
pub fn new(index: &Index, rtxn: &RoTxn) -> Result<Self> { pub fn new(index: &Index, rtxn: &RoTxn) -> Result<Self> {
let database_size = index.on_disk_size()?;
Ok(IndexStats { Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?, number_of_documents: index.number_of_documents(rtxn)?,
database_size, database_size: index.on_disk_size()?,
used_database_size: index.used_size()?,
field_distribution: index.field_distribution(rtxn)?, field_distribution: index.field_distribution(rtxn)?,
created_at: index.created_at(rtxn)?, created_at: index.created_at(rtxn)?,
updated_at: index.updated_at(rtxn)?, updated_at: index.updated_at(rtxn)?,
@ -123,15 +134,22 @@ impl IndexMapper {
index_base_map_size: usize, index_base_map_size: usize,
index_growth_amount: usize, index_growth_amount: usize,
index_count: usize, index_count: usize,
enable_mdb_writemap: bool,
indexer_config: IndexerConfig, indexer_config: IndexerConfig,
) -> Result<Self> { ) -> Result<Self> {
let mut wtxn = env.write_txn()?;
let index_mapping = env.create_database(&mut wtxn, Some(INDEX_MAPPING))?;
let index_stats = env.create_database(&mut wtxn, Some(INDEX_STATS))?;
wtxn.commit()?;
Ok(Self { Ok(Self {
index_map: Arc::new(RwLock::new(IndexMap::new(index_count))), index_map: Arc::new(RwLock::new(IndexMap::new(index_count))),
index_mapping: env.create_database(Some(INDEX_MAPPING))?, index_mapping,
index_stats: env.create_database(Some(INDEX_STATS))?, index_stats,
base_path, base_path,
index_base_map_size, index_base_map_size,
index_growth_amount, index_growth_amount,
enable_mdb_writemap,
indexer_config: Arc::new(indexer_config), indexer_config: Arc::new(indexer_config),
}) })
} }
@ -162,6 +180,7 @@ impl IndexMapper {
&uuid, &uuid,
&index_path, &index_path,
date, date,
self.enable_mdb_writemap,
self.index_base_map_size, self.index_base_map_size,
)?; )?;
@ -273,7 +292,11 @@ impl IndexMapper {
.ok_or_else(|| Error::IndexNotFound(name.to_string()))?; .ok_or_else(|| Error::IndexNotFound(name.to_string()))?;
// We remove the index from the in-memory index map. // We remove the index from the in-memory index map.
self.index_map.write().unwrap().close_for_resize(&uuid, self.index_growth_amount); self.index_map.write().unwrap().close_for_resize(
&uuid,
self.enable_mdb_writemap,
self.index_growth_amount,
);
Ok(()) Ok(())
} }
@ -338,6 +361,7 @@ impl IndexMapper {
&uuid, &uuid,
&index_path, &index_path,
None, None,
self.enable_mdb_writemap,
self.index_base_map_size, self.index_base_map_size,
)?; )?;
} }

View File

@ -31,7 +31,7 @@ mod uuid_codec;
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
pub type TaskId = u32; pub type TaskId = u32;
use std::collections::HashMap; use std::collections::{BTreeMap, HashMap};
use std::ops::{Bound, RangeBounds}; use std::ops::{Bound, RangeBounds};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool; use std::sync::atomic::AtomicBool;
@ -233,6 +233,8 @@ pub struct IndexSchedulerOptions {
pub task_db_size: usize, pub task_db_size: usize,
/// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index. /// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index.
pub index_base_map_size: usize, pub index_base_map_size: usize,
/// Whether we open a meilisearch index with the MDB_WRITEMAP option or not.
pub enable_mdb_writemap: bool,
/// The size, in bytes, by which the map size of an index is increased when it resized due to being full. /// The size, in bytes, by which the map size of an index is increased when it resized due to being full.
pub index_growth_amount: usize, pub index_growth_amount: usize,
/// The number of indexes that can be concurrently opened in memory. /// The number of indexes that can be concurrently opened in memory.
@ -374,6 +376,11 @@ impl IndexScheduler {
std::fs::create_dir_all(&options.indexes_path)?; std::fs::create_dir_all(&options.indexes_path)?;
std::fs::create_dir_all(&options.dumps_path)?; std::fs::create_dir_all(&options.dumps_path)?;
if cfg!(windows) && options.enable_mdb_writemap {
// programmer error if this happens: in normal use passing the option on Windows is an error in main
panic!("Windows doesn't support the MDB_WRITEMAP LMDB option");
}
let task_db_size = clamp_to_page_size(options.task_db_size); let task_db_size = clamp_to_page_size(options.task_db_size);
let budget = if options.indexer_config.skip_index_budget { let budget = if options.indexer_config.skip_index_budget {
IndexBudget { IndexBudget {
@ -396,25 +403,37 @@ impl IndexScheduler {
.open(options.tasks_path)?; .open(options.tasks_path)?;
let file_store = FileStore::new(&options.update_file_path)?; let file_store = FileStore::new(&options.update_file_path)?;
let mut wtxn = env.write_txn()?;
let all_tasks = env.create_database(&mut wtxn, Some(db_name::ALL_TASKS))?;
let status = env.create_database(&mut wtxn, Some(db_name::STATUS))?;
let kind = env.create_database(&mut wtxn, Some(db_name::KIND))?;
let index_tasks = env.create_database(&mut wtxn, Some(db_name::INDEX_TASKS))?;
let canceled_by = env.create_database(&mut wtxn, Some(db_name::CANCELED_BY))?;
let enqueued_at = env.create_database(&mut wtxn, Some(db_name::ENQUEUED_AT))?;
let started_at = env.create_database(&mut wtxn, Some(db_name::STARTED_AT))?;
let finished_at = env.create_database(&mut wtxn, Some(db_name::FINISHED_AT))?;
wtxn.commit()?;
// allow unreachable_code to get rids of the warning in the case of a test build. // allow unreachable_code to get rids of the warning in the case of a test build.
let this = Self { let this = Self {
must_stop_processing: MustStopProcessing::default(), must_stop_processing: MustStopProcessing::default(),
processing_tasks: Arc::new(RwLock::new(ProcessingTasks::new())), processing_tasks: Arc::new(RwLock::new(ProcessingTasks::new())),
file_store, file_store,
all_tasks: env.create_database(Some(db_name::ALL_TASKS))?, all_tasks,
status: env.create_database(Some(db_name::STATUS))?, status,
kind: env.create_database(Some(db_name::KIND))?, kind,
index_tasks: env.create_database(Some(db_name::INDEX_TASKS))?, index_tasks,
canceled_by: env.create_database(Some(db_name::CANCELED_BY))?, canceled_by,
enqueued_at: env.create_database(Some(db_name::ENQUEUED_AT))?, enqueued_at,
started_at: env.create_database(Some(db_name::STARTED_AT))?, started_at,
finished_at: env.create_database(Some(db_name::FINISHED_AT))?, finished_at,
index_mapper: IndexMapper::new( index_mapper: IndexMapper::new(
&env, &env,
options.indexes_path, options.indexes_path,
budget.map_size, budget.map_size,
options.index_growth_amount, options.index_growth_amount,
budget.index_count, budget.index_count,
options.enable_mdb_writemap,
options.indexer_config, options.indexer_config,
)?, )?,
env, env,
@ -554,10 +573,16 @@ impl IndexScheduler {
&self.index_mapper.indexer_config &self.index_mapper.indexer_config
} }
/// Return the real database size (i.e.: The size **with** the free pages)
pub fn size(&self) -> Result<u64> { pub fn size(&self) -> Result<u64> {
Ok(self.env.real_disk_size()?) Ok(self.env.real_disk_size()?)
} }
/// Return the used database size (i.e.: The size **without** the free pages)
pub fn used_size(&self) -> Result<u64> {
Ok(self.env.non_free_pages_size()?)
}
/// Return the index corresponding to the name. /// Return the index corresponding to the name.
/// ///
/// * If the index wasn't opened before, the index will be opened. /// * If the index wasn't opened before, the index will be opened.
@ -737,6 +762,38 @@ impl IndexScheduler {
Ok(tasks) Ok(tasks)
} }
/// The returned structure contains:
/// 1. The name of the property being observed can be `statuses`, `types`, or `indexes`.
/// 2. The name of the specific data related to the property can be `enqueued` for the `statuses`, `settingsUpdate` for the `types`, or the name of the index for the `indexes`, for example.
/// 3. The number of times the properties appeared.
pub fn get_stats(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> {
let rtxn = self.read_txn()?;
let mut res = BTreeMap::new();
res.insert(
"statuses".to_string(),
enum_iterator::all::<Status>()
.map(|s| Ok((s.to_string(), self.get_status(&rtxn, s)?.len())))
.collect::<Result<BTreeMap<String, u64>>>()?,
);
res.insert(
"types".to_string(),
enum_iterator::all::<Kind>()
.map(|s| Ok((s.to_string(), self.get_kind(&rtxn, s)?.len())))
.collect::<Result<BTreeMap<String, u64>>>()?,
);
res.insert(
"indexes".to_string(),
self.index_tasks
.iter(&rtxn)?
.map(|res| Ok(res.map(|(name, bitmap)| (name.to_string(), bitmap.len()))?))
.collect::<Result<BTreeMap<String, u64>>>()?,
);
Ok(res)
}
/// Return true iff there is at least one task associated with this index /// Return true iff there is at least one task associated with this index
/// that is processing. /// that is processing.
pub fn is_index_processing(&self, index: &str) -> Result<bool> { pub fn is_index_processing(&self, index: &str) -> Result<bool> {
@ -1471,6 +1528,7 @@ mod tests {
dumps_path: tempdir.path().join("dumps"), dumps_path: tempdir.path().join("dumps"),
task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
enable_mdb_writemap: false,
index_growth_amount: 1000 * 1000, // 1 MB index_growth_amount: 1000 * 1000, // 1 MB
index_count: 5, index_count: 5,
indexer_config, indexer_config,
@ -2017,6 +2075,105 @@ mod tests {
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded");
} }
#[test]
fn document_addition_and_document_deletion() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
let content = r#"[
{ "id": 1, "doggo": "jean bob" },
{ "id": 2, "catto": "jorts" },
{ "id": 3, "doggo": "bork" }
]"#;
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
file.persist().unwrap();
index_scheduler
.register(KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: Some(S("id")),
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
index_scheduler
.register(KindWithContent::DocumentDeletion {
index_uid: S("doggos"),
documents_ids: vec![S("1"), S("2")],
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
handle.advance_one_successful_batch(); // The addition AND deletion should've been batched together
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_batch");
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
#[test]
fn document_deletion_and_document_addition() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
index_scheduler
.register(KindWithContent::DocumentDeletion {
index_uid: S("doggos"),
documents_ids: vec![S("1"), S("2")],
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
let content = r#"[
{ "id": 1, "doggo": "jean bob" },
{ "id": 2, "catto": "jorts" },
{ "id": 3, "doggo": "bork" }
]"#;
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
file.persist().unwrap();
index_scheduler
.register(KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: Some(S("id")),
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
// The deletion should have failed because it can't create an index
handle.advance_one_failed_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_the_deletion");
// The addition should works
handle.advance_one_successful_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_last_successful_addition");
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
#[test] #[test]
fn do_not_batch_task_of_different_indexes() { fn do_not_batch_task_of_different_indexes() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

View File

@ -0,0 +1,43 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
1 {uid: 1, status: succeeded, details: { received_document_ids: 2, deleted_documents: Some(2) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued []
succeeded [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
"documentDeletion" [1,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
doggos: { number_of_documents: 1, field_distribution: {"doggo": 1, "id": 1} }
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,1,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,1,]
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -0,0 +1,9 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"id": 3,
"doggo": "bork"
}
]

View File

@ -0,0 +1,37 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -0,0 +1,40 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
1 {uid: 1, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
"documentDeletion" [1,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -0,0 +1,43 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [1,]
failed [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,]
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -0,0 +1,46 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued []
succeeded [1,]
failed [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
doggos: { number_of_documents: 3, field_distribution: {"catto": 1, "doggo": 2, "id": 3} }
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -0,0 +1,17 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"id": 1,
"doggo": "jean bob"
},
{
"id": 2,
"catto": "jorts"
},
{
"id": 3,
"doggo": "bork"
}
]

View File

@ -0,0 +1,36 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued [0,]
----------------------------------------------------------------------
### Kind:
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -0,0 +1,40 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -466,7 +466,7 @@ impl IndexScheduler {
} }
} }
Details::DocumentDeletionByFilter { deleted_documents, original_filter: _ } => { Details::DocumentDeletionByFilter { deleted_documents, original_filter: _ } => {
assert_eq!(kind.as_kind(), Kind::DocumentDeletionByFilter); assert_eq!(kind.as_kind(), Kind::DocumentDeletion);
let (index_uid, _) = if let KindWithContent::DocumentDeletionByFilter { let (index_uid, _) = if let KindWithContent::DocumentDeletionByFilter {
ref index_uid, ref index_uid,
ref filter_expr, ref filter_expr,

View File

@ -45,6 +45,11 @@ impl AuthController {
self.store.size() self.store.size()
} }
/// Return the used size of the `AuthController` database in bytes.
pub fn used_size(&self) -> Result<u64> {
self.store.used_size()
}
pub fn create_key(&self, create_key: CreateApiKey) -> Result<Key> { pub fn create_key(&self, create_key: CreateApiKey) -> Result<Key> {
match self.store.get_api_key(create_key.uid)? { match self.store.get_api_key(create_key.uid)? {
Some(_) => Err(AuthControllerError::ApiKeyAlreadyExists(create_key.uid.to_string())), Some(_) => Err(AuthControllerError::ApiKeyAlreadyExists(create_key.uid.to_string())),

View File

@ -55,9 +55,11 @@ impl HeedAuthStore {
let path = path.as_ref().join(AUTH_DB_PATH); let path = path.as_ref().join(AUTH_DB_PATH);
create_dir_all(&path)?; create_dir_all(&path)?;
let env = Arc::new(open_auth_store_env(path.as_ref())?); let env = Arc::new(open_auth_store_env(path.as_ref())?);
let keys = env.create_database(Some(KEY_DB_NAME))?; let mut wtxn = env.write_txn()?;
let keys = env.create_database(&mut wtxn, Some(KEY_DB_NAME))?;
let action_keyid_index_expiration = let action_keyid_index_expiration =
env.create_database(Some(KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME))?; env.create_database(&mut wtxn, Some(KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME))?;
wtxn.commit()?;
Ok(Self { env, keys, action_keyid_index_expiration, should_close_on_drop: true }) Ok(Self { env, keys, action_keyid_index_expiration, should_close_on_drop: true })
} }
@ -73,6 +75,11 @@ impl HeedAuthStore {
Ok(self.env.real_disk_size()?) Ok(self.env.real_disk_size()?)
} }
/// Return the number of bytes actually used in the database
pub fn used_size(&self) -> Result<u64> {
Ok(self.env.non_free_pages_size()?)
}
pub fn set_drop_on_close(&mut self, v: bool) { pub fn set_drop_on_close(&mut self, v: bool) {
self.should_close_on_drop = v; self.should_close_on_drop = v;
} }

View File

@ -150,6 +150,7 @@ make_missing_field_convenience_builder!(MissingApiKeyActions, missing_api_key_ac
make_missing_field_convenience_builder!(MissingApiKeyExpiresAt, missing_api_key_expires_at); make_missing_field_convenience_builder!(MissingApiKeyExpiresAt, missing_api_key_expires_at);
make_missing_field_convenience_builder!(MissingApiKeyIndexes, missing_api_key_indexes); make_missing_field_convenience_builder!(MissingApiKeyIndexes, missing_api_key_indexes);
make_missing_field_convenience_builder!(MissingSwapIndexes, missing_swap_indexes); make_missing_field_convenience_builder!(MissingSwapIndexes, missing_swap_indexes);
make_missing_field_convenience_builder!(MissingDocumentFilter, missing_document_filter);
// Integrate a sub-error into a [`DeserrError`] by taking its error message but using // Integrate a sub-error into a [`DeserrError`] by taking its error message but using
// the default error code (C) from `Self` // the default error code (C) from `Self`

View File

@ -214,12 +214,12 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ;
InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ; InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ; InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFields , InvalidRequest , BAD_REQUEST ; InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
InvalidDocumentId , InvalidRequest , BAD_REQUEST ; InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ; InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ; InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
InvalidDocumentDeleteFilter , InvalidRequest , BAD_REQUEST ;
InvalidIndexLimit , InvalidRequest , BAD_REQUEST ; InvalidIndexLimit , InvalidRequest , BAD_REQUEST ;
InvalidIndexOffset , InvalidRequest , BAD_REQUEST ; InvalidIndexOffset , InvalidRequest , BAD_REQUEST ;
InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ; InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ;

View File

@ -395,7 +395,6 @@ impl std::error::Error for ParseTaskStatusError {}
pub enum Kind { pub enum Kind {
DocumentAdditionOrUpdate, DocumentAdditionOrUpdate,
DocumentDeletion, DocumentDeletion,
DocumentDeletionByFilter,
SettingsUpdate, SettingsUpdate,
IndexCreation, IndexCreation,
IndexDeletion, IndexDeletion,
@ -412,7 +411,6 @@ impl Kind {
match self { match self {
Kind::DocumentAdditionOrUpdate Kind::DocumentAdditionOrUpdate
| Kind::DocumentDeletion | Kind::DocumentDeletion
| Kind::DocumentDeletionByFilter
| Kind::SettingsUpdate | Kind::SettingsUpdate
| Kind::IndexCreation | Kind::IndexCreation
| Kind::IndexDeletion | Kind::IndexDeletion
@ -430,7 +428,6 @@ impl Display for Kind {
match self { match self {
Kind::DocumentAdditionOrUpdate => write!(f, "documentAdditionOrUpdate"), Kind::DocumentAdditionOrUpdate => write!(f, "documentAdditionOrUpdate"),
Kind::DocumentDeletion => write!(f, "documentDeletion"), Kind::DocumentDeletion => write!(f, "documentDeletion"),
Kind::DocumentDeletionByFilter => write!(f, "documentDeletionByFilter"),
Kind::SettingsUpdate => write!(f, "settingsUpdate"), Kind::SettingsUpdate => write!(f, "settingsUpdate"),
Kind::IndexCreation => write!(f, "indexCreation"), Kind::IndexCreation => write!(f, "indexCreation"),
Kind::IndexDeletion => write!(f, "indexDeletion"), Kind::IndexDeletion => write!(f, "indexDeletion"),

View File

@ -5,7 +5,7 @@ use actix_web::HttpRequest;
use meilisearch_types::InstanceUid; use meilisearch_types::InstanceUid;
use serde_json::Value; use serde_json::Value;
use super::{find_user_id, Analytics, DocumentDeletionKind}; use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery; use crate::routes::tasks::TasksFilterQuery;
use crate::Opt; use crate::Opt;
@ -71,6 +71,8 @@ impl Analytics for MockAnalytics {
_request: &HttpRequest, _request: &HttpRequest,
) { ) {
} }
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {} fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {}
fn health_seen(&self, _request: &HttpRequest) {} fn health_seen(&self, _request: &HttpRequest) {}
} }

View File

@ -67,6 +67,12 @@ pub enum DocumentDeletionKind {
PerFilter, PerFilter,
} }
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum DocumentFetchKind {
PerDocumentId,
Normal { with_filter: bool, limit: usize, offset: usize },
}
pub trait Analytics: Sync + Send { pub trait Analytics: Sync + Send {
fn instance_uid(&self) -> Option<&InstanceUid>; fn instance_uid(&self) -> Option<&InstanceUid>;
@ -90,6 +96,12 @@ pub trait Analytics: Sync + Send {
request: &HttpRequest, request: &HttpRequest,
); );
// this method should be called to aggregate a fetch documents request
fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
// this method should be called to aggregate a fetch documents request
fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
// this method should be called to aggregate a add documents request // this method should be called to aggregate a add documents request
fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest); fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest);

View File

@ -23,7 +23,9 @@ use tokio::select;
use tokio::sync::mpsc::{self, Receiver, Sender}; use tokio::sync::mpsc::{self, Receiver, Sender};
use uuid::Uuid; use uuid::Uuid;
use super::{config_user_id_path, DocumentDeletionKind, MEILISEARCH_CONFIG_PATH}; use super::{
config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH,
};
use crate::analytics::Analytics; use crate::analytics::Analytics;
use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot}; use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot};
use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::documents::UpdateDocumentsQuery;
@ -72,6 +74,8 @@ pub enum AnalyticsMsg {
AggregateAddDocuments(DocumentsAggregator), AggregateAddDocuments(DocumentsAggregator),
AggregateDeleteDocuments(DocumentsDeletionAggregator), AggregateDeleteDocuments(DocumentsDeletionAggregator),
AggregateUpdateDocuments(DocumentsAggregator), AggregateUpdateDocuments(DocumentsAggregator),
AggregateGetFetchDocuments(DocumentsFetchAggregator),
AggregatePostFetchDocuments(DocumentsFetchAggregator),
AggregateTasks(TasksAggregator), AggregateTasks(TasksAggregator),
AggregateHealth(HealthAggregator), AggregateHealth(HealthAggregator),
} }
@ -139,6 +143,8 @@ impl SegmentAnalytics {
add_documents_aggregator: DocumentsAggregator::default(), add_documents_aggregator: DocumentsAggregator::default(),
delete_documents_aggregator: DocumentsDeletionAggregator::default(), delete_documents_aggregator: DocumentsDeletionAggregator::default(),
update_documents_aggregator: DocumentsAggregator::default(), update_documents_aggregator: DocumentsAggregator::default(),
get_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
post_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
get_tasks_aggregator: TasksAggregator::default(), get_tasks_aggregator: TasksAggregator::default(),
health_aggregator: HealthAggregator::default(), health_aggregator: HealthAggregator::default(),
}); });
@ -205,6 +211,16 @@ impl super::Analytics for SegmentAnalytics {
let _ = self.sender.try_send(AnalyticsMsg::AggregateUpdateDocuments(aggregate)); let _ = self.sender.try_send(AnalyticsMsg::AggregateUpdateDocuments(aggregate));
} }
fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) {
let aggregate = DocumentsFetchAggregator::from_query(documents_query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateGetFetchDocuments(aggregate));
}
fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) {
let aggregate = DocumentsFetchAggregator::from_query(documents_query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate));
}
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) { fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) {
let aggregate = TasksAggregator::from_query(query, request); let aggregate = TasksAggregator::from_query(query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate)); let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate));
@ -225,6 +241,7 @@ impl super::Analytics for SegmentAnalytics {
struct Infos { struct Infos {
env: String, env: String,
experimental_enable_metrics: bool, experimental_enable_metrics: bool,
experimental_reduce_indexing_memory_usage: bool,
db_path: bool, db_path: bool,
import_dump: bool, import_dump: bool,
dump_dir: bool, dump_dir: bool,
@ -258,6 +275,7 @@ impl From<Opt> for Infos {
let Opt { let Opt {
db_path, db_path,
experimental_enable_metrics, experimental_enable_metrics,
experimental_reduce_indexing_memory_usage,
http_addr, http_addr,
master_key: _, master_key: _,
env, env,
@ -300,6 +318,7 @@ impl From<Opt> for Infos {
Self { Self {
env, env,
experimental_enable_metrics, experimental_enable_metrics,
experimental_reduce_indexing_memory_usage,
db_path: db_path != PathBuf::from("./data.ms"), db_path: db_path != PathBuf::from("./data.ms"),
import_dump: import_dump.is_some(), import_dump: import_dump.is_some(),
dump_dir: dump_dir != PathBuf::from("dumps/"), dump_dir: dump_dir != PathBuf::from("dumps/"),
@ -338,6 +357,8 @@ pub struct Segment {
add_documents_aggregator: DocumentsAggregator, add_documents_aggregator: DocumentsAggregator,
delete_documents_aggregator: DocumentsDeletionAggregator, delete_documents_aggregator: DocumentsDeletionAggregator,
update_documents_aggregator: DocumentsAggregator, update_documents_aggregator: DocumentsAggregator,
get_fetch_documents_aggregator: DocumentsFetchAggregator,
post_fetch_documents_aggregator: DocumentsFetchAggregator,
get_tasks_aggregator: TasksAggregator, get_tasks_aggregator: TasksAggregator,
health_aggregator: HealthAggregator, health_aggregator: HealthAggregator,
} }
@ -400,6 +421,8 @@ impl Segment {
Some(AnalyticsMsg::AggregateAddDocuments(agreg)) => self.add_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateAddDocuments(agreg)) => self.add_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateDeleteDocuments(agreg)) => self.delete_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateDeleteDocuments(agreg)) => self.delete_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg),
None => (), None => (),
@ -450,6 +473,10 @@ impl Segment {
.into_event(&self.user, "Documents Deleted"); .into_event(&self.user, "Documents Deleted");
let update_documents = std::mem::take(&mut self.update_documents_aggregator) let update_documents = std::mem::take(&mut self.update_documents_aggregator)
.into_event(&self.user, "Documents Updated"); .into_event(&self.user, "Documents Updated");
let get_fetch_documents = std::mem::take(&mut self.get_fetch_documents_aggregator)
.into_event(&self.user, "Documents Fetched GET");
let post_fetch_documents = std::mem::take(&mut self.post_fetch_documents_aggregator)
.into_event(&self.user, "Documents Fetched POST");
let get_tasks = let get_tasks =
std::mem::take(&mut self.get_tasks_aggregator).into_event(&self.user, "Tasks Seen"); std::mem::take(&mut self.get_tasks_aggregator).into_event(&self.user, "Tasks Seen");
let health = let health =
@ -473,6 +500,12 @@ impl Segment {
if let Some(update_documents) = update_documents { if let Some(update_documents) = update_documents {
let _ = self.batcher.push(update_documents).await; let _ = self.batcher.push(update_documents).await;
} }
if let Some(get_fetch_documents) = get_fetch_documents {
let _ = self.batcher.push(get_fetch_documents).await;
}
if let Some(post_fetch_documents) = post_fetch_documents {
let _ = self.batcher.push(post_fetch_documents).await;
}
if let Some(get_tasks) = get_tasks { if let Some(get_tasks) = get_tasks {
let _ = self.batcher.push(get_tasks).await; let _ = self.batcher.push(get_tasks).await;
} }
@ -1135,3 +1168,76 @@ impl HealthAggregator {
}) })
} }
} }
#[derive(Default, Serialize)]
pub struct DocumentsFetchAggregator {
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
#[serde(rename = "requests.max_limit")]
total_received: usize,
// a call on ../documents/:doc_id
per_document_id: bool,
// if a filter was used
per_filter: bool,
// pagination
#[serde(rename = "pagination.max_limit")]
max_limit: usize,
#[serde(rename = "pagination.max_offset")]
max_offset: usize,
}
impl DocumentsFetchAggregator {
pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self {
let (limit, offset) = match query {
DocumentFetchKind::PerDocumentId => (1, 0),
DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset),
};
Self {
timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(),
total_received: 1,
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId),
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
max_limit: limit,
max_offset: offset,
}
}
/// Aggregate one [DocumentsFetchAggregator] into another.
pub fn aggregate(&mut self, other: Self) {
if self.timestamp.is_none() {
self.timestamp = other.timestamp;
}
for user_agent in other.user_agents {
self.user_agents.insert(user_agent);
}
self.total_received = self.total_received.saturating_add(other.total_received);
self.per_document_id |= other.per_document_id;
self.per_filter |= other.per_filter;
self.max_limit = self.max_limit.max(other.max_limit);
self.max_offset = self.max_offset.max(other.max_offset);
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
// if we had no timestamp it means we never encountered any events and
// thus we don't need to send this event.
let timestamp = self.timestamp?;
Some(Track {
timestamp: Some(timestamp),
user: user.clone(),
event: event_name.to_string(),
properties: serde_json::to_value(self).ok()?,
..Default::default()
})
}
}

View File

@ -61,7 +61,7 @@ impl ErrorCode for MeilisearchHttpError {
MeilisearchHttpError::MissingPayload(_) => Code::MissingPayload, MeilisearchHttpError::MissingPayload(_) => Code::MissingPayload,
MeilisearchHttpError::InvalidContentType(_, _) => Code::InvalidContentType, MeilisearchHttpError::InvalidContentType(_, _) => Code::InvalidContentType,
MeilisearchHttpError::DocumentNotFound(_) => Code::DocumentNotFound, MeilisearchHttpError::DocumentNotFound(_) => Code::DocumentNotFound,
MeilisearchHttpError::EmptyFilter => Code::InvalidDocumentDeleteFilter, MeilisearchHttpError::EmptyFilter => Code::InvalidDocumentFilter,
MeilisearchHttpError::InvalidExpression(_, _) => Code::InvalidSearchFilter, MeilisearchHttpError::InvalidExpression(_, _) => Code::InvalidSearchFilter,
MeilisearchHttpError::PayloadTooLarge(_) => Code::PayloadTooLarge, MeilisearchHttpError::PayloadTooLarge(_) => Code::PayloadTooLarge,
MeilisearchHttpError::SwapIndexPayloadWrongLength(_) => Code::InvalidSwapIndexes, MeilisearchHttpError::SwapIndexPayloadWrongLength(_) => Code::InvalidSwapIndexes,

View File

@ -232,6 +232,7 @@ fn open_or_create_database_unchecked(
dumps_path: opt.dump_dir.clone(), dumps_path: opt.dump_dir.clone(),
task_db_size: opt.max_task_db_size.get_bytes() as usize, task_db_size: opt.max_task_db_size.get_bytes() as usize,
index_base_map_size: opt.max_index_size.get_bytes() as usize, index_base_map_size: opt.max_index_size.get_bytes() as usize,
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
indexer_config: (&opt.indexer_options).try_into()?, indexer_config: (&opt.indexer_options).try_into()?,
autobatching_enabled: true, autobatching_enabled: true,
max_number_of_tasks: 1_000_000, max_number_of_tasks: 1_000_000,

View File

@ -29,6 +29,11 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
let (opt, config_read_from) = Opt::try_build()?; let (opt, config_read_from) = Opt::try_build()?;
anyhow::ensure!(
!(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
"The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"
);
setup(&opt)?; setup(&opt)?;
match (opt.env.as_ref(), &opt.master_key) { match (opt.env.as_ref(), &opt.master_key) {

View File

@ -4,20 +4,32 @@ use prometheus::{
register_int_gauge_vec, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, register_int_gauge_vec, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec,
}; };
const HTTP_RESPONSE_TIME_CUSTOM_BUCKETS: &[f64; 14] = &[ /// Create evenly distributed buckets
0.0005, 0.0008, 0.00085, 0.0009, 0.00095, 0.001, 0.00105, 0.0011, 0.00115, 0.0012, 0.0015, fn create_buckets() -> [f64; 29] {
0.002, 0.003, 1.0, (0..10)
]; .chain((10..100).step_by(10))
.chain((100..=1000).step_by(100))
.map(|i| i as f64 / 1000.)
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
lazy_static! { lazy_static! {
pub static ref HTTP_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!( pub static ref HTTP_RESPONSE_TIME_CUSTOM_BUCKETS: [f64; 29] = create_buckets();
opts!("http_requests_total", "HTTP requests total"), pub static ref MEILISEARCH_HTTP_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!(
opts!("meilisearch_http_requests_total", "Meilisearch HTTP requests total"),
&["method", "path"] &["method", "path"]
) )
.expect("Can't create a metric"); .expect("Can't create a metric");
pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge = pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge =
register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch Db Size In Bytes")) register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch DB Size In Bytes"))
.expect("Can't create a metric"); .expect("Can't create a metric");
pub static ref MEILISEARCH_USED_DB_SIZE_BYTES: IntGauge = register_int_gauge!(opts!(
"meilisearch_used_db_size_bytes",
"Meilisearch Used DB Size In Bytes"
))
.expect("Can't create a metric");
pub static ref MEILISEARCH_INDEX_COUNT: IntGauge = pub static ref MEILISEARCH_INDEX_COUNT: IntGauge =
register_int_gauge!(opts!("meilisearch_index_count", "Meilisearch Index Count")) register_int_gauge!(opts!("meilisearch_index_count", "Meilisearch Index Count"))
.expect("Can't create a metric"); .expect("Can't create a metric");
@ -26,11 +38,16 @@ lazy_static! {
&["index"] &["index"]
) )
.expect("Can't create a metric"); .expect("Can't create a metric");
pub static ref HTTP_RESPONSE_TIME_SECONDS: HistogramVec = register_histogram_vec!( pub static ref MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS: HistogramVec = register_histogram_vec!(
"http_response_time_seconds", "http_response_time_seconds",
"HTTP response times", "HTTP response times",
&["method", "path"], &["method", "path"],
HTTP_RESPONSE_TIME_CUSTOM_BUCKETS.to_vec() HTTP_RESPONSE_TIME_CUSTOM_BUCKETS.to_vec()
) )
.expect("Can't create a metric"); .expect("Can't create a metric");
pub static ref MEILISEARCH_NB_TASKS: IntGaugeVec = register_int_gauge_vec!(
opts!("meilisearch_nb_tasks", "Meilisearch Number of tasks"),
&["kind", "value"]
)
.expect("Can't create a metric");
} }

View File

@ -52,11 +52,11 @@ where
if is_registered_resource { if is_registered_resource {
let request_method = req.method().to_string(); let request_method = req.method().to_string();
histogram_timer = Some( histogram_timer = Some(
crate::metrics::HTTP_RESPONSE_TIME_SECONDS crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path]) .with_label_values(&[&request_method, request_path])
.start_timer(), .start_timer(),
); );
crate::metrics::HTTP_REQUESTS_TOTAL crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
.with_label_values(&[&request_method, request_path]) .with_label_values(&[&request_method, request_path])
.inc(); .inc();
} }

View File

@ -48,6 +48,8 @@ const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS";
const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR";
const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
const DEFAULT_DB_PATH: &str = "./data.ms"; const DEFAULT_DB_PATH: &str = "./data.ms";
@ -293,6 +295,11 @@ pub struct Opt {
#[serde(default)] #[serde(default)]
pub experimental_enable_metrics: bool, pub experimental_enable_metrics: bool,
/// Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
#[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)]
#[serde(default)]
pub experimental_reduce_indexing_memory_usage: bool,
#[serde(flatten)] #[serde(flatten)]
#[clap(flatten)] #[clap(flatten)]
pub indexer_options: IndexerOpts, pub indexer_options: IndexerOpts,
@ -385,6 +392,7 @@ impl Opt {
#[cfg(all(not(debug_assertions), feature = "analytics"))] #[cfg(all(not(debug_assertions), feature = "analytics"))]
no_analytics, no_analytics,
experimental_enable_metrics: enable_metrics_route, experimental_enable_metrics: enable_metrics_route,
experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
} = self; } = self;
export_to_env_if_not_present(MEILI_DB_PATH, db_path); export_to_env_if_not_present(MEILI_DB_PATH, db_path);
export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr); export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@ -426,6 +434,10 @@ impl Opt {
MEILI_EXPERIMENTAL_ENABLE_METRICS, MEILI_EXPERIMENTAL_ENABLE_METRICS,
enable_metrics_route.to_string(), enable_metrics_route.to_string(),
); );
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
reduce_indexing_memory_usage.to_string(),
);
indexer_options.export_to_env(); indexer_options.export_to_env();
} }

View File

@ -29,7 +29,7 @@ use tempfile::tempfile;
use tokio::fs::File; use tokio::fs::File;
use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter}; use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter};
use crate::analytics::{Analytics, DocumentDeletionKind}; use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind};
use crate::error::MeilisearchHttpError; use crate::error::MeilisearchHttpError;
use crate::error::PayloadError::ReceivePayload; use crate::error::PayloadError::ReceivePayload;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
@ -97,10 +97,14 @@ pub async fn get_document(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
document_param: web::Path<DocumentParam>, document_param: web::Path<DocumentParam>,
params: AwebQueryParameter<GetDocument, DeserrQueryParamError>, params: AwebQueryParameter<GetDocument, DeserrQueryParamError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let DocumentParam { index_uid, document_id } = document_param.into_inner(); let DocumentParam { index_uid, document_id } = document_param.into_inner();
let index_uid = IndexUid::try_from(index_uid)?; let index_uid = IndexUid::try_from(index_uid)?;
analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req);
let GetDocument { fields } = params.into_inner(); let GetDocument { fields } = params.into_inner();
let attributes_to_retrieve = fields.merge_star_and_none(); let attributes_to_retrieve = fields.merge_star_and_none();
@ -161,16 +165,31 @@ pub async fn documents_by_query_post(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
body: AwebJson<BrowseQuery, DeserrJsonError>, body: AwebJson<BrowseQuery, DeserrJsonError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!("called with body: {:?}", body); debug!("called with body: {:?}", body);
documents_by_query(&index_scheduler, index_uid, body.into_inner()) let body = body.into_inner();
analytics.post_fetch_documents(
&DocumentFetchKind::Normal {
with_filter: body.filter.is_some(),
limit: body.limit,
offset: body.offset,
},
&req,
);
documents_by_query(&index_scheduler, index_uid, body)
} }
pub async fn get_documents( pub async fn get_documents(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>, params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!("called with params: {:?}", params); debug!("called with params: {:?}", params);
@ -191,6 +210,15 @@ pub async fn get_documents(
filter, filter,
}; };
analytics.get_fetch_documents(
&DocumentFetchKind::Normal {
with_filter: query.filter.is_some(),
limit: query.limit,
offset: query.offset,
},
&req,
);
documents_by_query(&index_scheduler, index_uid, query) documents_by_query(&index_scheduler, index_uid, query)
} }
@ -458,7 +486,7 @@ pub async fn delete_documents_batch(
#[derive(Debug, Deserr)] #[derive(Debug, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct DocumentDeletionByFilter { pub struct DocumentDeletionByFilter {
#[deserr(error = DeserrJsonError<InvalidDocumentDeleteFilter>)] #[deserr(error = DeserrJsonError<InvalidDocumentFilter>, missing_field_error = DeserrJsonError::missing_document_filter)]
filter: Value, filter: Value,
} }
@ -480,8 +508,8 @@ pub async fn delete_documents_by_filter(
|| -> Result<_, ResponseError> { || -> Result<_, ResponseError> {
Ok(crate::search::parse_filter(&filter)?.ok_or(MeilisearchHttpError::EmptyFilter)?) Ok(crate::search::parse_filter(&filter)?.ok_or(MeilisearchHttpError::EmptyFilter)?)
}() }()
// and whatever was the error, the error code should always be an InvalidDocumentDeleteFilter // and whatever was the error, the error code should always be an InvalidDocumentFilter
.map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentDeleteFilter))?; .map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?;
let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter };
let task: SummarizedTaskView = let task: SummarizedTaskView =
@ -540,7 +568,12 @@ fn retrieve_documents<S: AsRef<str>>(
}; };
let candidates = if let Some(filter) = filter { let candidates = if let Some(filter) = filter {
filter.evaluate(&rtxn, index)? filter.evaluate(&rtxn, index).map_err(|err| match err {
milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter)
}
e => e.into(),
})?
} else { } else {
index.documents_ids(&rtxn)? index.documents_ids(&rtxn)?
}; };

View File

@ -17,7 +17,7 @@ pub fn configure(config: &mut web::ServiceConfig) {
pub async fn get_metrics( pub async fn get_metrics(
index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
auth_controller: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<AuthController>>, auth_controller: Data<AuthController>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let auth_filters = index_scheduler.filters(); let auth_filters = index_scheduler.filters();
if !auth_filters.all_indexes_authorized() { if !auth_filters.all_indexes_authorized() {
@ -28,10 +28,10 @@ pub async fn get_metrics(
return Err(error); return Err(error);
} }
let response = let response = create_all_stats((*index_scheduler).clone(), auth_controller, auth_filters)?;
create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), auth_filters)?;
crate::metrics::MEILISEARCH_DB_SIZE_BYTES.set(response.database_size as i64); crate::metrics::MEILISEARCH_DB_SIZE_BYTES.set(response.database_size as i64);
crate::metrics::MEILISEARCH_USED_DB_SIZE_BYTES.set(response.used_database_size as i64);
crate::metrics::MEILISEARCH_INDEX_COUNT.set(response.indexes.len() as i64); crate::metrics::MEILISEARCH_INDEX_COUNT.set(response.indexes.len() as i64);
for (index, value) in response.indexes.iter() { for (index, value) in response.indexes.iter() {
@ -40,6 +40,14 @@ pub async fn get_metrics(
.set(value.number_of_documents as i64); .set(value.number_of_documents as i64);
} }
for (kind, value) in index_scheduler.get_stats()? {
for (value, count) in value {
crate::metrics::MEILISEARCH_NB_TASKS
.with_label_values(&[&kind, &value])
.set(count as i64);
}
}
let encoder = TextEncoder::new(); let encoder = TextEncoder::new();
let mut buffer = vec![]; let mut buffer = vec![];
encoder.encode(&prometheus::gather(), &mut buffer).expect("Failed to encode metrics"); encoder.encode(&prometheus::gather(), &mut buffer).expect("Failed to encode metrics");

View File

@ -231,6 +231,8 @@ pub async fn running() -> HttpResponse {
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct Stats { pub struct Stats {
pub database_size: u64, pub database_size: u64,
#[serde(skip)]
pub used_database_size: u64,
#[serde(serialize_with = "time::serde::rfc3339::option::serialize")] #[serde(serialize_with = "time::serde::rfc3339::option::serialize")]
pub last_update: Option<OffsetDateTime>, pub last_update: Option<OffsetDateTime>,
pub indexes: BTreeMap<String, indexes::IndexStats>, pub indexes: BTreeMap<String, indexes::IndexStats>,
@ -259,6 +261,7 @@ pub fn create_all_stats(
let mut last_task: Option<OffsetDateTime> = None; let mut last_task: Option<OffsetDateTime> = None;
let mut indexes = BTreeMap::new(); let mut indexes = BTreeMap::new();
let mut database_size = 0; let mut database_size = 0;
let mut used_database_size = 0;
for index_uid in index_scheduler.index_names()? { for index_uid in index_scheduler.index_names()? {
// Accumulate the size of all indexes, even unauthorized ones, so // Accumulate the size of all indexes, even unauthorized ones, so
@ -266,6 +269,7 @@ pub fn create_all_stats(
// See <https://github.com/meilisearch/meilisearch/pull/3541#discussion_r1126747643> for context. // See <https://github.com/meilisearch/meilisearch/pull/3541#discussion_r1126747643> for context.
let stats = index_scheduler.index_stats(&index_uid)?; let stats = index_scheduler.index_stats(&index_uid)?;
database_size += stats.inner_stats.database_size; database_size += stats.inner_stats.database_size;
used_database_size += stats.inner_stats.used_database_size;
if !filters.is_index_authorized(&index_uid) { if !filters.is_index_authorized(&index_uid) {
continue; continue;
@ -278,10 +282,14 @@ pub fn create_all_stats(
} }
database_size += index_scheduler.size()?; database_size += index_scheduler.size()?;
used_database_size += index_scheduler.used_size()?;
database_size += auth_controller.size()?; database_size += auth_controller.size()?;
database_size += index_scheduler.compute_update_file_size()?; used_database_size += auth_controller.used_size()?;
let update_file_size = index_scheduler.compute_update_file_size()?;
database_size += update_file_size;
used_database_size += update_file_size;
let stats = Stats { database_size, last_update: last_task, indexes }; let stats = Stats { database_size, used_database_size, last_update: last_task, indexes };
Ok(stats) Ok(stats)
} }

View File

@ -99,7 +99,7 @@ pub struct DetailsView {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub deleted_tasks: Option<Option<u64>>, pub deleted_tasks: Option<Option<u64>>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub original_filter: Option<String>, pub original_filter: Option<Option<String>>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub dump_uid: Option<Option<String>>, pub dump_uid: Option<Option<String>>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
@ -131,12 +131,13 @@ impl From<Details> for DetailsView {
} => DetailsView { } => DetailsView {
provided_ids: Some(received_document_ids), provided_ids: Some(received_document_ids),
deleted_documents: Some(deleted_documents), deleted_documents: Some(deleted_documents),
original_filter: Some(None),
..DetailsView::default() ..DetailsView::default()
}, },
Details::DocumentDeletionByFilter { original_filter, deleted_documents } => { Details::DocumentDeletionByFilter { original_filter, deleted_documents } => {
DetailsView { DetailsView {
provided_ids: Some(0), provided_ids: Some(0),
original_filter: Some(original_filter), original_filter: Some(Some(original_filter)),
deleted_documents: Some(deleted_documents), deleted_documents: Some(deleted_documents),
..DetailsView::default() ..DetailsView::default()
} }
@ -148,7 +149,7 @@ impl From<Details> for DetailsView {
DetailsView { DetailsView {
matched_tasks: Some(matched_tasks), matched_tasks: Some(matched_tasks),
canceled_tasks: Some(canceled_tasks), canceled_tasks: Some(canceled_tasks),
original_filter: Some(original_filter), original_filter: Some(Some(original_filter)),
..DetailsView::default() ..DetailsView::default()
} }
} }
@ -156,7 +157,7 @@ impl From<Details> for DetailsView {
DetailsView { DetailsView {
matched_tasks: Some(matched_tasks), matched_tasks: Some(matched_tasks),
deleted_tasks: Some(deleted_tasks), deleted_tasks: Some(deleted_tasks),
original_filter: Some(original_filter), original_filter: Some(Some(original_filter)),
..DetailsView::default() ..DetailsView::default()
} }
} }
@ -729,7 +730,7 @@ mod tests {
let err = deserr_query_params::<TaskDeletionOrCancelationQuery>(params).unwrap_err(); let err = deserr_query_params::<TaskDeletionOrCancelationQuery>(params).unwrap_err();
snapshot!(meili_snap::json_string!(err), @r###" snapshot!(meili_snap::json_string!(err), @r###"
{ {
"message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `documentDeletionByFilter`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
"code": "invalid_task_types", "code": "invalid_task_types",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_task_types" "link": "https://docs.meilisearch.com/errors#invalid_task_types"

View File

@ -16,8 +16,11 @@ pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'
("GET", "/indexes/products/search") => hashset!{"search", "*"}, ("GET", "/indexes/products/search") => hashset!{"search", "*"},
("POST", "/indexes/products/documents") => hashset!{"documents.add", "documents.*", "*"}, ("POST", "/indexes/products/documents") => hashset!{"documents.add", "documents.*", "*"},
("GET", "/indexes/products/documents") => hashset!{"documents.get", "documents.*", "*"}, ("GET", "/indexes/products/documents") => hashset!{"documents.get", "documents.*", "*"},
("POST", "/indexes/products/documents/fetch") => hashset!{"documents.get", "documents.*", "*"},
("GET", "/indexes/products/documents/0") => hashset!{"documents.get", "documents.*", "*"}, ("GET", "/indexes/products/documents/0") => hashset!{"documents.get", "documents.*", "*"},
("DELETE", "/indexes/products/documents/0") => hashset!{"documents.delete", "documents.*", "*"}, ("DELETE", "/indexes/products/documents/0") => hashset!{"documents.delete", "documents.*", "*"},
("POST", "/indexes/products/documents/delete-batch") => hashset!{"documents.delete", "documents.*", "*"},
("POST", "/indexes/products/documents/delete") => hashset!{"documents.delete", "documents.*", "*"},
("GET", "/tasks") => hashset!{"tasks.get", "tasks.*", "*"}, ("GET", "/tasks") => hashset!{"tasks.get", "tasks.*", "*"},
("DELETE", "/tasks") => hashset!{"tasks.delete", "tasks.*", "*"}, ("DELETE", "/tasks") => hashset!{"tasks.delete", "tasks.*", "*"},
("GET", "/tasks?indexUid=products") => hashset!{"tasks.get", "tasks.*", "*"}, ("GET", "/tasks?indexUid=products") => hashset!{"tasks.get", "tasks.*", "*"},

View File

@ -180,9 +180,9 @@ async fn get_all_documents_bad_filter() {
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo=bernese", "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo=bernese",
"code": "invalid_search_filter", "code": "invalid_document_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
} }
"###); "###);
} }
@ -547,9 +547,9 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Invalid syntax for the filter parameter: `expected String, Array, found: true`.", "message": "Invalid syntax for the filter parameter: `expected String, Array, found: true`.",
"code": "invalid_document_delete_filter", "code": "invalid_document_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_delete_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
} }
"###); "###);
@ -559,9 +559,9 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `hello`.\n1:6 hello", "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `hello`.\n1:6 hello",
"code": "invalid_document_delete_filter", "code": "invalid_document_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_delete_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
} }
"###); "###);
@ -571,9 +571,21 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Sending an empty filter is forbidden.", "message": "Sending an empty filter is forbidden.",
"code": "invalid_document_delete_filter", "code": "invalid_document_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_delete_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
}
"###);
// do not send any filter
let (response, code) = index.delete_document_by_filter(json!({})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Missing field `filter`",
"code": "missing_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#missing_document_filter"
} }
"###); "###);
@ -630,9 +642,9 @@ async fn delete_document_by_filter() {
}, },
"error": { "error": {
"message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese",
"code": "invalid_search_filter", "code": "invalid_document_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
}, },
"duration": "[duration]", "duration": "[duration]",
"enqueuedAt": "[date]", "enqueuedAt": "[date]",
@ -664,9 +676,9 @@ async fn delete_document_by_filter() {
}, },
"error": { "error": {
"message": "Attribute `catto` is not filterable. Available filterable attributes are: `doggo`.\n1:6 catto = jorts", "message": "Attribute `catto` is not filterable. Available filterable attributes are: `doggo`.\n1:6 catto = jorts",
"code": "invalid_search_filter", "code": "invalid_document_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
}, },
"duration": "[duration]", "duration": "[duration]",
"enqueuedAt": "[date]", "enqueuedAt": "[date]",
@ -748,4 +760,27 @@ async fn fetch_document_by_filter() {
"link": "https://docs.meilisearch.com/errors#invalid_document_filter" "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
} }
"###); "###);
let (response, code) = index.get_document_by_filter(json!({ "filter": "cool doggo" })).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `cool doggo`.\n1:11 cool doggo",
"code": "invalid_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_filter"
}
"###);
let (response, code) =
index.get_document_by_filter(json!({ "filter": "doggo = bernese" })).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Attribute `doggo` is not filterable. Available filterable attributes are: `color`.\n1:6 doggo = bernese",
"code": "invalid_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_filter"
}
"###);
} }

View File

@ -946,7 +946,7 @@ async fn sort_unset_ranking_rule() {
index.wait_task(1).await; index.wait_task(1).await;
let expected_response = json!({ let expected_response = json!({
"message": "The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.", "message": "You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.",
"code": "invalid_search_sort", "code": "invalid_search_sort",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_sort" "link": "https://docs.meilisearch.com/errors#invalid_search_sort"

View File

@ -97,7 +97,7 @@ async fn task_bad_types() {
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `documentDeletionByFilter`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
"code": "invalid_task_types", "code": "invalid_task_types",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_task_types" "link": "https://docs.meilisearch.com/errors#invalid_task_types"
@ -108,7 +108,7 @@ async fn task_bad_types() {
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `documentDeletionByFilter`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
"code": "invalid_task_types", "code": "invalid_task_types",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_task_types" "link": "https://docs.meilisearch.com/errors#invalid_task_types"
@ -119,7 +119,7 @@ async fn task_bad_types() {
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `documentDeletionByFilter`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
"code": "invalid_task_types", "code": "invalid_task_types",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_task_types" "link": "https://docs.meilisearch.com/errors#invalid_task_types"

View File

@ -413,7 +413,7 @@ async fn test_summarized_document_addition_or_update() {
} }
#[actix_web::test] #[actix_web::test]
async fn test_summarized_delete_batch() { async fn test_summarized_delete_documents_by_batch() {
let server = Server::new().await; let server = Server::new().await;
let index = server.index("test"); let index = server.index("test");
index.delete_batch(vec![1, 2, 3]).await; index.delete_batch(vec![1, 2, 3]).await;
@ -430,7 +430,8 @@ async fn test_summarized_delete_batch() {
"canceledBy": null, "canceledBy": null,
"details": { "details": {
"providedIds": 3, "providedIds": 3,
"deletedDocuments": 0 "deletedDocuments": 0,
"originalFilter": null
}, },
"error": { "error": {
"message": "Index `test` not found.", "message": "Index `test` not found.",
@ -460,7 +461,8 @@ async fn test_summarized_delete_batch() {
"canceledBy": null, "canceledBy": null,
"details": { "details": {
"providedIds": 1, "providedIds": 1,
"deletedDocuments": 0 "deletedDocuments": 0,
"originalFilter": null
}, },
"error": null, "error": null,
"duration": "[duration]", "duration": "[duration]",
@ -472,7 +474,100 @@ async fn test_summarized_delete_batch() {
} }
#[actix_web::test] #[actix_web::test]
async fn test_summarized_delete_document() { async fn test_summarized_delete_documents_by_filter() {
let server = Server::new().await;
let index = server.index("test");
index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await;
index.wait_task(0).await;
let (task, _) = index.get_task(0).await;
assert_json_snapshot!(task,
{ ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" },
@r###"
{
"uid": 0,
"indexUid": "test",
"status": "failed",
"type": "documentDeletion",
"canceledBy": null,
"details": {
"providedIds": 0,
"deletedDocuments": 0,
"originalFilter": "\"doggo = bernese\""
},
"error": {
"message": "Index `test` not found.",
"code": "index_not_found",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#index_not_found"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
index.create(None).await;
index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await;
index.wait_task(2).await;
let (task, _) = index.get_task(2).await;
assert_json_snapshot!(task,
{ ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" },
@r###"
{
"uid": 2,
"indexUid": "test",
"status": "failed",
"type": "documentDeletion",
"canceledBy": null,
"details": {
"providedIds": 0,
"deletedDocuments": 0,
"originalFilter": "\"doggo = bernese\""
},
"error": {
"message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese",
"code": "invalid_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_filter"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
index.update_settings(json!({ "filterableAttributes": ["doggo"] })).await;
index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await;
index.wait_task(4).await;
let (task, _) = index.get_task(4).await;
assert_json_snapshot!(task,
{ ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" },
@r###"
{
"uid": 4,
"indexUid": "test",
"status": "succeeded",
"type": "documentDeletion",
"canceledBy": null,
"details": {
"providedIds": 0,
"deletedDocuments": 0,
"originalFilter": "\"doggo = bernese\""
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
#[actix_web::test]
async fn test_summarized_delete_document_by_id() {
let server = Server::new().await; let server = Server::new().await;
let index = server.index("test"); let index = server.index("test");
index.delete_document(1).await; index.delete_document(1).await;
@ -489,7 +584,8 @@ async fn test_summarized_delete_document() {
"canceledBy": null, "canceledBy": null,
"details": { "details": {
"providedIds": 1, "providedIds": 1,
"deletedDocuments": 0 "deletedDocuments": 0,
"originalFilter": null
}, },
"error": { "error": {
"message": "Index `test` not found.", "message": "Index `test` not found.",
@ -519,7 +615,8 @@ async fn test_summarized_delete_document() {
"canceledBy": null, "canceledBy": null,
"details": { "details": {
"providedIds": 1, "providedIds": 1,
"deletedDocuments": 0 "deletedDocuments": 0,
"originalFilter": null
}, },
"error": null, "error": null,
"duration": "[duration]", "duration": "[duration]",

View File

@ -25,8 +25,13 @@ flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7" fst = "0.4.7"
fxhash = "0.2.1" fxhash = "0.2.1"
geoutils = "0.5.1" geoutils = "0.5.1"
grenad = { version = "0.4.4", default-features = false, features = ["tempfile"] } grenad = { version = "0.4.4", default-features = false, features = [
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.5", default-features = false, features = ["lmdb", "sync-read-txn"] } "tempfile",
] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-features = false, features = [
"lmdb",
"sync-read-txn",
] }
json-depth-checker = { path = "../json-depth-checker" } json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.10" memmap2 = "0.5.10"
@ -39,12 +44,17 @@ rstar = { version = "0.10.0", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] } serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] } serde_json = { version = "1.0.95", features = ["preserve_order"] }
slice-group-by = "0.3.0" slice-group-by = "0.3.0"
smallstr = { version = "0.3.0", features = ["serde"] } smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.10.0" smallvec = "1.10.0"
smartstring = "1.0.1" smartstring = "1.0.1"
tempfile = "3.5.0" tempfile = "3.5.0"
thiserror = "1.0.40" thiserror = "1.0.40"
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] } time = { version = "0.3.20", features = [
"serde-well-known",
"formatting",
"parsing",
"macros",
] }
uuid = { version = "1.3.1", features = ["v4"] } uuid = { version = "1.3.1", features = ["v4"] }
filter-parser = { path = "../filter-parser" } filter-parser = { path = "../filter-parser" }
@ -63,13 +73,10 @@ big_s = "1.0.2"
insta = "1.29.0" insta = "1.29.0"
maplit = "1.0.2" maplit = "1.0.2"
md5 = "0.7.0" md5 = "0.7.0"
rand = {version = "0.8.5", features = ["small_rng"] } rand = { version = "0.8.5", features = ["small_rng"] }
[target.'cfg(fuzzing)'.dev-dependencies]
fuzzcheck = "0.12.1"
[features] [features]
all-tokenizations = [ "charabia/default" ] all-tokenizations = ["charabia/default"]
# Use POSIX semaphores instead of SysV semaphores in LMDB # Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml # For more information on this feature, see heed's Cargo.toml

View File

@ -111,7 +111,6 @@ pub enum Error {
Io(#[from] io::Error), Io(#[from] io::Error),
} }
#[cfg(test)]
pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> { pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
let documents = match json { let documents = match json {
object @ serde_json::Value::Object(_) => vec![object], object @ serde_json::Value::Object(_) => vec![object],
@ -141,7 +140,6 @@ macro_rules! documents {
}}; }};
} }
#[cfg(test)]
pub fn documents_batch_reader_from_objects( pub fn documents_batch_reader_from_objects(
objects: impl IntoIterator<Item = Object>, objects: impl IntoIterator<Item = Object>,
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> { ) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {

View File

@ -126,7 +126,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
#[error("{}", HeedError::BadOpenOptions)] #[error("{}", HeedError::BadOpenOptions)]
InvalidLmdbOpenOptions, InvalidLmdbOpenOptions,
#[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] #[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]
SortRankingRuleMissing, SortRankingRuleMissing,
#[error("The database file is in an invalid state.")] #[error("The database file is in an invalid state.")]
InvalidStoreFile, InvalidStoreFile,

View File

@ -106,22 +106,30 @@ impl<'a> ExternalDocumentsIds<'a> {
map map
} }
/// Return an fst of the combined hard and soft deleted ID.
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
if self.soft.is_empty() {
return Ok(Cow::Borrowed(&self.hard));
}
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
}
fn merge_soft_into_hard(&mut self) -> fst::Result<()> { fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
if self.soft.len() >= self.hard.len() / 2 { if self.soft.len() >= self.hard.len() / 2 {
let union_op = self.hard.op().add(&self.soft).r#union(); self.hard = self.to_fst()?.into_owned();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
self.soft = fst::Map::default().map_data(Cow::Owned)?; self.soft = fst::Map::default().map_data(Cow::Owned)?;
} }

View File

@ -49,7 +49,7 @@ impl CboRoaringBitmapCodec {
} else { } else {
// Otherwise, it means we used the classic RoaringBitmapCodec and // Otherwise, it means we used the classic RoaringBitmapCodec and
// that the header takes threshold integers. // that the header takes threshold integers.
RoaringBitmap::deserialize_from(bytes) RoaringBitmap::deserialize_unchecked_from(bytes)
} }
} }
@ -69,7 +69,7 @@ impl CboRoaringBitmapCodec {
vec.push(integer); vec.push(integer);
} }
} else { } else {
roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
} }
} }

View File

@ -8,7 +8,7 @@ impl heed::BytesDecode<'_> for RoaringBitmapCodec {
type DItem = RoaringBitmap; type DItem = RoaringBitmap;
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
RoaringBitmap::deserialize_from(bytes).ok() RoaringBitmap::deserialize_unchecked_from(bytes).ok()
} }
} }

View File

@ -21,10 +21,9 @@ use crate::heed_codec::facet::{
}; };
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
use crate::{ use crate::{
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
Search, U8StrStrCodec, BEU16, BEU32,
}; };
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -111,9 +110,6 @@ pub struct Index {
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>, pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
/// Maps a word and a document id (u32) to all the positions where the given word appears.
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears. /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
@ -170,33 +166,45 @@ impl Index {
unsafe { options.flag(Flags::MdbAlwaysFreePages) }; unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?; let env = options.open(path)?;
let main = env.create_poly_database(Some(MAIN))?; let mut wtxn = env.write_txn()?;
let word_docids = env.create_database(Some(WORD_DOCIDS))?; let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let exact_word_prefix_docids =
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?; let word_pair_proximity_docids =
env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let script_language_docids =
env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
let word_prefix_pair_proximity_docids = let word_prefix_pair_proximity_docids =
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
let prefix_word_pair_proximity_docids = let prefix_word_pair_proximity_docids =
env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?; let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let field_id_word_count_docids =
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; env.create_database(&mut wtxn, Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?; let word_prefix_position_docids =
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; env.create_database(&mut wtxn, Some(WORD_PREFIX_POSITION_DOCIDS))?;
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let word_prefix_fid_docids =
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; env.create_database(&mut wtxn, Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?; let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?; let facet_id_string_docids =
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_exists_docids =
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids =
env.create_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids =
env.create_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_f64s =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings = let field_id_docid_facet_strings =
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let documents = env.create_database(Some(DOCUMENTS))?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?;
Index::set_creation_dates(&env, main, created_at, updated_at)?; Index::set_creation_dates(&env, main, created_at, updated_at)?;
@ -207,7 +215,6 @@ impl Index {
exact_word_docids, exact_word_docids,
word_prefix_docids, word_prefix_docids,
exact_word_prefix_docids, exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids, word_pair_proximity_docids,
script_language_docids, script_language_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
@ -1459,9 +1466,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution, db_snap!(index, field_distribution,
@r###" @r###"
age 1 age 1 |
id 2 id 2 |
name 2 name 2 |
"### "###
); );
@ -1479,9 +1486,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution, db_snap!(index, field_distribution,
@r###" @r###"
age 1 age 1 |
id 2 id 2 |
name 2 name 2 |
"### "###
); );
@ -1495,9 +1502,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution, db_snap!(index, field_distribution,
@r###" @r###"
has_dog 1 has_dog 1 |
id 2 id 2 |
name 2 name 2 |
"### "###
); );
} }

View File

@ -5,52 +5,6 @@
#[global_allocator] #[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
// #[cfg(test)]
// pub mod allocator {
// use std::alloc::{GlobalAlloc, System};
// use std::sync::atomic::{self, AtomicI64};
// #[global_allocator]
// pub static ALLOC: CountingAlloc = CountingAlloc {
// max_resident: AtomicI64::new(0),
// resident: AtomicI64::new(0),
// allocated: AtomicI64::new(0),
// };
// pub struct CountingAlloc {
// pub max_resident: AtomicI64,
// pub resident: AtomicI64,
// pub allocated: AtomicI64,
// }
// unsafe impl GlobalAlloc for CountingAlloc {
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let old_resident =
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let resident = old_resident + layout.size() as i64;
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
// // if layout.size() > 1_000_000 {
// // eprintln!(
// // "allocating {} with new resident size: {resident}",
// // layout.size() / 1_000_000
// // );
// // // let trace = std::backtrace::Backtrace::capture();
// // // let t = trace.to_string();
// // // eprintln!("{t}");
// // }
// System.alloc(layout)
// }
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
// System.dealloc(ptr, layout)
// }
// }
// }
#[macro_use] #[macro_use]
pub mod documents; pub mod documents;

View File

@ -26,7 +26,6 @@ pub fn apply_distinct_rule(
ctx: &mut SearchContext, ctx: &mut SearchContext,
field_id: u16, field_id: u16,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
// TODO: add a universe here, such that the `excluded` are a subset of the universe?
) -> Result<DistinctOutput> { ) -> Result<DistinctOutput> {
let mut excluded = RoaringBitmap::new(); let mut excluded = RoaringBitmap::new();
let mut remaining = RoaringBitmap::new(); let mut remaining = RoaringBitmap::new();

View File

@ -206,7 +206,7 @@ impl State {
)?; )?;
intersection &= &candidates; intersection &= &candidates;
if !intersection.is_empty() { if !intersection.is_empty() {
// TODO: although not really worth it in terms of performance, // Although not really worth it in terms of performance,
// if would be good to put this in cache for the sake of consistency // if would be good to put this in cache for the sake of consistency
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
ctx.index ctx.index

View File

@ -46,7 +46,7 @@ use super::logger::SearchLogger;
use super::query_graph::QueryNode; use super::query_graph::QueryNode;
use super::ranking_rule_graph::{ use super::ranking_rule_graph::{
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph, ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
}; };
use super::small_bitmap::SmallBitmap; use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@ -54,6 +54,12 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::PathVisitor; use crate::search::new::ranking_rule_graph::PathVisitor;
use crate::{Result, TermsMatchingStrategy}; use crate::{Result, TermsMatchingStrategy};
pub type Words = GraphBasedRankingRule<WordsGraph>;
impl GraphBasedRankingRule<WordsGraph> {
pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
}
}
pub type Proximity = GraphBasedRankingRule<ProximityGraph>; pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
impl GraphBasedRankingRule<ProximityGraph> { impl GraphBasedRankingRule<ProximityGraph> {
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self { pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {

View File

@ -32,7 +32,7 @@ impl<T> Interned<T> {
#[derive(Clone)] #[derive(Clone)]
pub struct DedupInterner<T> { pub struct DedupInterner<T> {
stable_store: Vec<T>, stable_store: Vec<T>,
lookup: FxHashMap<T, Interned<T>>, // TODO: Arc lookup: FxHashMap<T, Interned<T>>,
} }
impl<T> Default for DedupInterner<T> { impl<T> Default for DedupInterner<T> {
fn default() -> Self { fn default() -> Self {

View File

@ -1,5 +1,4 @@
/// Maximum number of tokens we consider in a single search. /// Maximum number of tokens we consider in a single search.
// TODO: Loic, find proper value here so we don't overflow the interner.
pub const MAX_TOKEN_COUNT: usize = 1_000; pub const MAX_TOKEN_COUNT: usize = 1_000;
/// Maximum number of prefixes that can be derived from a single word. /// Maximum number of prefixes that can be derived from a single word.

View File

@ -4,7 +4,6 @@ use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::time::Instant; use std::time::Instant;
// use rand::random;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::search::new::interner::Interned; use crate::search::new::interner::Interned;
@ -13,6 +12,7 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::{ use crate::search::new::ranking_rule_graph::{
Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition, Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
WordsCondition, WordsGraph,
}; };
use crate::search::new::ranking_rules::BoxRankingRule; use crate::search::new::ranking_rules::BoxRankingRule;
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
@ -24,11 +24,12 @@ pub enum SearchEvents {
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
ExtendResults { new: Vec<u32> }, ExtendResults { new: Vec<u32> },
WordsGraph { query_graph: QueryGraph },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> }, ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> }, ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
TypoGraph { graph: RankingRuleGraph<TypoGraph> }, TypoGraph { graph: RankingRuleGraph<TypoGraph> },
TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> }, TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
WordsGraph { graph: RankingRuleGraph<WordsGraph> },
WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
FidGraph { graph: RankingRuleGraph<FidGraph> }, FidGraph { graph: RankingRuleGraph<FidGraph> },
FidPaths { paths: Vec<Vec<Interned<FidCondition>>> }, FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
PositionGraph { graph: RankingRuleGraph<PositionGraph> }, PositionGraph { graph: RankingRuleGraph<PositionGraph> },
@ -139,8 +140,11 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
let Some(location) = self.location.last() else { return }; let Some(location) = self.location.last() else { return };
match location { match location {
Location::Words => { Location::Words => {
if let Some(query_graph) = state.downcast_ref::<QueryGraph>() { if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() }); self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
}
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
} }
} }
Location::Typo => { Location::Typo => {
@ -329,7 +333,6 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
SearchEvents::ExtendResults { new } => { SearchEvents::ExtendResults { new } => {
self.write_extend_results(new)?; self.write_extend_results(new)?;
} }
SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?,
SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?, SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::ProximityPaths { paths } => { SearchEvents::ProximityPaths { paths } => {
self.write_rr_graph_paths::<ProximityGraph>(paths)?; self.write_rr_graph_paths::<ProximityGraph>(paths)?;
@ -338,6 +341,10 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
SearchEvents::TypoPaths { paths } => { SearchEvents::TypoPaths { paths } => {
self.write_rr_graph_paths::<TypoGraph>(paths)?; self.write_rr_graph_paths::<TypoGraph>(paths)?;
} }
SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::WordsPaths { paths } => {
self.write_rr_graph_paths::<WordsGraph>(paths)?;
}
SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?, SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::FidPaths { paths } => { SearchEvents::FidPaths { paths } => {
self.write_rr_graph_paths::<FidGraph>(paths)?; self.write_rr_graph_paths::<FidGraph>(paths)?;
@ -455,7 +462,7 @@ fill: \"#B6E2D3\"
shape: class shape: class
max_nbr_typo: {}", max_nbr_typo: {}",
term_subset.description(ctx), term_subset.description(ctx),
term_subset.max_nbr_typos(ctx) term_subset.max_typo_cost(ctx)
)?; )?;
for w in term_subset.all_single_words_except_prefix_db(ctx)? { for w in term_subset.all_single_words_except_prefix_db(ctx)? {
@ -482,13 +489,6 @@ fill: \"#B6E2D3\"
} }
Ok(()) Ok(())
} }
fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> {
self.make_new_file_for_internal_state_if_needed()?;
self.write_query_graph(&qg)?;
Ok(())
}
fn write_rr_graph<R: RankingRuleGraphTrait>( fn write_rr_graph<R: RankingRuleGraphTrait>(
&mut self, &mut self,
graph: &RankingRuleGraph<R>, graph: &RankingRuleGraph<R>,

View File

@ -15,11 +15,7 @@ mod resolve_query_graph;
mod small_bitmap; mod small_bitmap;
mod exact_attribute; mod exact_attribute;
// TODO: documentation + comments
// implementation is currently an adaptation of the previous implementation to fit with the new model
mod sort; mod sort;
// TODO: documentation + comments
mod words;
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
@ -43,10 +39,10 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use sort::Sort; use sort::Sort;
use words::Words;
use self::geo_sort::GeoSort; use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy; pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned; use self::interner::Interned;
use crate::search::new::distinct::apply_distinct_rule; use crate::search::new::distinct::apply_distinct_rule;
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
@ -202,6 +198,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
let mut sorted_fields = HashSet::new(); let mut sorted_fields = HashSet::new();
let mut geo_sorted = false; let mut geo_sorted = false;
// Don't add the `words` ranking rule if the term matching strategy is `All`
if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
words = true;
}
let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![]; let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules { for rr in settings_ranking_rules {
@ -397,8 +398,8 @@ pub fn execute_search(
None None
}; };
let bucket_sort_output = if let Some(query_terms) = query_terms { let bucket_sort_output = if let Some(query_terms) = query_terms {
let graph = QueryGraph::from_query(ctx, &query_terms)?; let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(query_terms); located_query_terms = Some(new_located_query_terms);
let ranking_rules = get_ranking_rules_for_query_graph_search( let ranking_rules = get_ranking_rules_for_query_graph_search(
ctx, ctx,

View File

@ -88,29 +88,33 @@ pub struct QueryGraph {
} }
impl QueryGraph { impl QueryGraph {
/// Build the query graph from the parsed user search query. /// Build the query graph from the parsed user search query, return an updated list of the located query terms
/// which contains ngrams.
pub fn from_query( pub fn from_query(
ctx: &mut SearchContext, ctx: &mut SearchContext,
// NOTE: the terms here must be consecutive // The terms here must be consecutive
terms: &[LocatedQueryTerm], terms: &[LocatedQueryTerm],
) -> Result<QueryGraph> { ) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
let mut new_located_query_terms = terms.to_vec();
let nbr_typos = number_of_typos_allowed(ctx)?; let nbr_typos = number_of_typos_allowed(ctx)?;
let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End]; let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
let root_node = 0; let root_node = 0;
let end_node = 1; let end_node = 1;
// TODO: we could consider generalizing to 4,5,6,7,etc. ngrams // Ee could consider generalizing to 4,5,6,7,etc. ngrams
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) = let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
(vec![], vec![], vec![root_node]); (vec![], vec![], vec![root_node]);
let original_terms_len = terms.len(); let original_terms_len = terms.len();
for term_idx in 0..original_terms_len { for term_idx in 0..original_terms_len {
let mut new_nodes = vec![]; let mut new_nodes = vec![];
let new_node_idx = add_node( let new_node_idx = add_node(
&mut nodes_data, &mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset { QueryNodeData::Term(LocatedQueryTermSubset {
term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)), term_subset: QueryTermSubset::full(terms[term_idx].value),
positions: terms[term_idx].positions.clone(), positions: terms[term_idx].positions.clone(),
term_ids: term_idx as u8..=term_idx as u8, term_ids: term_idx as u8..=term_idx as u8,
}), }),
@ -121,6 +125,7 @@ impl QueryGraph {
if let Some(ngram) = if let Some(ngram) =
query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)? query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
{ {
new_located_query_terms.push(ngram.clone());
let ngram_idx = add_node( let ngram_idx = add_node(
&mut nodes_data, &mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset { QueryNodeData::Term(LocatedQueryTermSubset {
@ -136,6 +141,7 @@ impl QueryGraph {
if let Some(ngram) = if let Some(ngram) =
query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)? query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
{ {
new_located_query_terms.push(ngram.clone());
let ngram_idx = add_node( let ngram_idx = add_node(
&mut nodes_data, &mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset { QueryNodeData::Term(LocatedQueryTermSubset {
@ -167,7 +173,7 @@ impl QueryGraph {
let mut graph = QueryGraph { root_node, end_node, nodes }; let mut graph = QueryGraph { root_node, end_node, nodes };
graph.build_initial_edges(); graph.build_initial_edges();
Ok(graph) Ok((graph, new_located_query_terms))
} }
/// Remove the given nodes, connecting all their predecessors to all their successors. /// Remove the given nodes, connecting all their predecessors to all their successors.

View File

@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo {
impl Interned<QueryTerm> { impl Interned<QueryTerm> {
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
let s = ctx.term_interner.get_mut(self); let s = ctx.term_interner.get_mut(self);
if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
assert!(s.two_typo.is_uninit()); assert!(s.two_typo.is_uninit());
// Initialize one_typo subterm even if max_nbr_typo is 0 because of split words // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
self.initialize_one_typo_subterm(ctx)?; self.initialize_one_typo_subterm(ctx)?;
let s = ctx.term_interner.get_mut(self); let s = ctx.term_interner.get_mut(self);
assert!(s.one_typo.is_init()); assert!(s.one_typo.is_init());
s.two_typo = Lazy::Init(TwoTypoTerm::default()); s.two_typo = Lazy::Init(TwoTypoTerm::default());
} else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
assert!(s.two_typo.is_uninit()); assert!(s.two_typo.is_uninit());
self.initialize_one_and_two_typo_subterm(ctx)?; self.initialize_one_and_two_typo_subterm(ctx)?;
let s = ctx.term_interner.get_mut(self); let s = ctx.term_interner.get_mut(self);
@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word(
original: ctx.word_interner.insert(word.to_owned()), original: ctx.word_interner.insert(word.to_owned()),
ngram_words: None, ngram_words: None,
is_prefix: false, is_prefix: false,
max_nbr_typos: 0, max_levenshtein_distance: 0,
zero_typo: <_>::default(), zero_typo: <_>::default(),
one_typo: Lazy::Init(<_>::default()), one_typo: Lazy::Init(<_>::default()),
two_typo: Lazy::Init(<_>::default()), two_typo: Lazy::Init(<_>::default()),
@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word(
Ok(QueryTerm { Ok(QueryTerm {
original: word_interned, original: word_interned,
ngram_words: None, ngram_words: None,
max_nbr_typos: max_typo, max_levenshtein_distance: max_typo,
is_prefix, is_prefix,
zero_typo, zero_typo,
one_typo: Lazy::Uninit, one_typo: Lazy::Uninit,
@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
impl Interned<QueryTerm> { impl Interned<QueryTerm> {
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
let self_mut = ctx.term_interner.get_mut(self); let self_mut = ctx.term_interner.get_mut(self);
let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
let allows_split_words = self_mut.allows_split_words();
let QueryTerm {
original,
is_prefix,
one_typo,
max_levenshtein_distance: max_nbr_typos,
..
} = self_mut;
let original = *original; let original = *original;
let is_prefix = *is_prefix; let is_prefix = *is_prefix;
// let original_str = ctx.word_interner.get(*original).to_owned(); // let original_str = ctx.word_interner.get(*original).to_owned();
@ -300,13 +309,17 @@ impl Interned<QueryTerm> {
})?; })?;
} }
let original_str = ctx.word_interner.get(original).to_owned(); let split_words = if allows_split_words {
let split_words = find_split_words(ctx, original_str.as_str())?; let original_str = ctx.word_interner.get(original).to_owned();
find_split_words(ctx, original_str.as_str())?
} else {
None
};
let self_mut = ctx.term_interner.get_mut(self); let self_mut = ctx.term_interner.get_mut(self);
// Only add the split words to the derivations if: // Only add the split words to the derivations if:
// 1. the term is not an ngram; OR // 1. the term is neither an ngram nor a phrase; OR
// 2. the term is an ngram, but the split words are different from the ngram's component words // 2. the term is an ngram, but the split words are different from the ngram's component words
let split_words = if let Some((ngram_words, split_words)) = let split_words = if let Some((ngram_words, split_words)) =
self_mut.ngram_words.as_ref().zip(split_words.as_ref()) self_mut.ngram_words.as_ref().zip(split_words.as_ref())
@ -328,7 +341,13 @@ impl Interned<QueryTerm> {
} }
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
let self_mut = ctx.term_interner.get_mut(self); let self_mut = ctx.term_interner.get_mut(self);
let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; let QueryTerm {
original,
is_prefix,
two_typo,
max_levenshtein_distance: max_nbr_typos,
..
} = self_mut;
let original_str = ctx.word_interner.get(*original).to_owned(); let original_str = ctx.word_interner.get(*original).to_owned();
if two_typo.is_init() { if two_typo.is_init() {
return Ok(()); return Ok(());

View File

@ -43,7 +43,7 @@ pub struct QueryTermSubset {
pub struct QueryTerm { pub struct QueryTerm {
original: Interned<String>, original: Interned<String>,
ngram_words: Option<Vec<Interned<String>>>, ngram_words: Option<Vec<Interned<String>>>,
max_nbr_typos: u8, max_levenshtein_distance: u8,
is_prefix: bool, is_prefix: bool,
zero_typo: ZeroTypoTerm, zero_typo: ZeroTypoTerm,
// May not be computed yet // May not be computed yet
@ -132,7 +132,6 @@ impl QueryTermSubset {
if full_query_term.ngram_words.is_some() { if full_query_term.ngram_words.is_some() {
return None; return None;
} }
// TODO: included in subset
if let Some(phrase) = full_query_term.zero_typo.phrase { if let Some(phrase) = full_query_term.zero_typo.phrase {
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
} else if let Some(word) = full_query_term.zero_typo.exact { } else if let Some(word) = full_query_term.zero_typo.exact {
@ -182,7 +181,6 @@ impl QueryTermSubset {
let word = match &self.zero_typo_subset { let word = match &self.zero_typo_subset {
NTypoTermSubset::All => Some(use_prefix_db), NTypoTermSubset::All => Some(use_prefix_db),
NTypoTermSubset::Subset { words, phrases: _ } => { NTypoTermSubset::Subset { words, phrases: _ } => {
// TODO: use a subset of prefix words instead
if words.contains(&use_prefix_db) { if words.contains(&use_prefix_db) {
Some(use_prefix_db) Some(use_prefix_db)
} else { } else {
@ -204,7 +202,6 @@ impl QueryTermSubset {
ctx: &mut SearchContext, ctx: &mut SearchContext,
) -> Result<BTreeSet<Word>> { ) -> Result<BTreeSet<Word>> {
let mut result = BTreeSet::default(); let mut result = BTreeSet::default();
// TODO: a compute_partially funtion
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
self.original.compute_fully_if_needed(ctx)?; self.original.compute_fully_if_needed(ctx)?;
} }
@ -300,7 +297,6 @@ impl QueryTermSubset {
let mut result = BTreeSet::default(); let mut result = BTreeSet::default();
if !self.one_typo_subset.is_empty() { if !self.one_typo_subset.is_empty() {
// TODO: compute less than fully if possible
self.original.compute_fully_if_needed(ctx)?; self.original.compute_fully_if_needed(ctx)?;
} }
let original = ctx.term_interner.get_mut(self.original); let original = ctx.term_interner.get_mut(self.original);
@ -342,10 +338,16 @@ impl QueryTermSubset {
} }
None None
} }
pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 { pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 {
let t = ctx.term_interner.get(self.original); let t = ctx.term_interner.get(self.original);
match t.max_nbr_typos { match t.max_levenshtein_distance {
0 => 0, 0 => {
if t.allows_split_words() {
1
} else {
0
}
}
1 => { 1 => {
if self.one_typo_subset.is_empty() { if self.one_typo_subset.is_empty() {
0 0
@ -438,6 +440,9 @@ impl QueryTerm {
self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
} }
fn allows_split_words(&self) -> bool {
self.zero_typo.phrase.is_none()
}
} }
impl Interned<QueryTerm> { impl Interned<QueryTerm> {

View File

@ -77,13 +77,9 @@ pub fn located_query_terms_from_tokens(
} }
} }
TokenKind::Separator(separator_kind) => { TokenKind::Separator(separator_kind) => {
match separator_kind { // add penalty for hard separators
SeparatorKind::Hard => { if let SeparatorKind::Hard = separator_kind {
position += 1; position = position.wrapping_add(7);
}
SeparatorKind::Soft => {
position += 0;
}
} }
phrase = 'phrase: { phrase = 'phrase: {
@ -143,7 +139,6 @@ pub fn number_of_typos_allowed<'ctx>(
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
// TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
let exact_words = ctx.index.exact_words(ctx.txn)?; let exact_words = ctx.index.exact_words(ctx.txn)?;
Ok(Box::new(move |word: &str| { Ok(Box::new(move |word: &str| {
@ -217,7 +212,7 @@ pub fn make_ngram(
original: ngram_str_interned, original: ngram_str_interned,
ngram_words: Some(words_interned), ngram_words: Some(words_interned),
is_prefix, is_prefix,
max_nbr_typos, max_levenshtein_distance: max_nbr_typos,
zero_typo: term.zero_typo, zero_typo: term.zero_typo,
one_typo: Lazy::Uninit, one_typo: Lazy::Uninit,
two_typo: Lazy::Uninit, two_typo: Lazy::Uninit,
@ -254,8 +249,6 @@ impl PhraseBuilder {
} else { } else {
// token has kind Word // token has kind Word
let word = ctx.word_interner.insert(token.lemma().to_string()); let word = ctx.word_interner.insert(token.lemma().to_string());
// TODO: in a phrase, check that every word exists
// otherwise return an empty term
self.words.push(Some(word)); self.words.push(Some(word));
} }
} }
@ -271,7 +264,7 @@ impl PhraseBuilder {
QueryTerm { QueryTerm {
original: ctx.word_interner.insert(phrase_desc), original: ctx.word_interner.insert(phrase_desc),
ngram_words: None, ngram_words: None,
max_nbr_typos: 0, max_levenshtein_distance: 0,
is_prefix: false, is_prefix: false,
zero_typo: ZeroTypoTerm { zero_typo: ZeroTypoTerm {
phrase: Some(phrase), phrase: Some(phrase),
@ -288,3 +281,36 @@ impl PhraseBuilder {
}) })
} }
} }
#[cfg(test)]
mod tests {
use charabia::TokenizerBuilder;
use super::*;
use crate::index::tests::TempIndex;
fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
]))
.unwrap();
temp_index
}
#[test]
fn start_with_hard_separator() -> Result<()> {
let tokenizer = TokenizerBuilder::new().build();
let tokens = tokenizer.tokenize(".");
let index = temp_index_with_documents();
let rtxn = index.read_txn()?;
let mut ctx = SearchContext::new(&index, &rtxn);
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
assert!(located_query_terms.is_empty());
Ok(())
}
}

View File

@ -1,5 +1,48 @@
#![allow(clippy::too_many_arguments)] /** Implements a "PathVisitor" which finds all paths of a certain cost
from the START to END node of a ranking rule graph.
A path is a list of conditions. A condition is the data associated with
an edge, given by the ranking rule. Some edges don't have a condition associated
with them, they are "unconditional". These kinds of edges are used to "skip" a node.
The algorithm uses a depth-first search. It benefits from two main optimisations:
- The list of all possible costs to go from any node to the END node is precomputed
- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
untraversable depending on what other edges were selected.
These two optimisations are meant to avoid traversing edges that wouldn't lead
to a valid path. In practically all cases, we avoid the exponential complexity
that is inherent to depth-first search in a large ranking rule graph.
The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
conditions to a list of traversed conditions.
For example, the DeadEndsCache could say the following:
- Immediately, from the start, the conditions `[a,b]` are forbidden
- if we take the condition `c`, then the conditions `[e]` are also forbidden
- and if after that, we take `f`, then `[h,i]` are also forbidden
- etc.
- if we take `g`, then `[f]` is also forbidden
- etc.
- etc.
As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
When a path is found from START to END, we give it to the `visit` closure.
This closure takes a mutable reference to the `DeadEndsCache`. This means that
the caller can update this cache. Therefore, we must handle the case where the
DeadEndsCache has been updated. This means potentially backtracking up to the point
where the traversed conditions are all allowed by the new DeadEndsCache.
The algorithm also implements the `TermsMatchingStrategy` logic.
Some edges are augmented with a list of "nodes_to_skip". Skipping
a node means "reaching this node through an unconditional edge". If we have
already traversed (ie. not skipped) a node that is in this list, then we know that we
can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
future node that was present in the "nodes_to_skip" list.
The caller can decide to stop the path finding algorithm
by returning a `ControlFlow::Break` from the `visit` closure.
*/
use std::collections::{BTreeSet, VecDeque}; use std::collections::{BTreeSet, VecDeque};
use std::iter::FromIterator; use std::iter::FromIterator;
use std::ops::ControlFlow; use std::ops::ControlFlow;
@ -12,30 +55,41 @@ use crate::search::new::query_graph::QueryNode;
use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::small_bitmap::SmallBitmap;
use crate::Result; use crate::Result;
/// Closure which processes a path found by the `PathVisitor`
type VisitFn<'f, G> = &'f mut dyn FnMut( type VisitFn<'f, G> = &'f mut dyn FnMut(
// the path as a list of conditions
&[Interned<<G as RankingRuleGraphTrait>::Condition>], &[Interned<<G as RankingRuleGraphTrait>::Condition>],
&mut RankingRuleGraph<G>, &mut RankingRuleGraph<G>,
// a mutable reference to the DeadEndsCache, to update it in case the given
// path doesn't resolve to any valid document ids
&mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>, &mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
) -> Result<ControlFlow<()>>; ) -> Result<ControlFlow<()>>;
/// A structure which is kept but not updated during the traversal of the graph.
/// It can however be updated by the `visit` closure once a valid path has been found.
struct VisitorContext<'a, G: RankingRuleGraphTrait> { struct VisitorContext<'a, G: RankingRuleGraphTrait> {
graph: &'a mut RankingRuleGraph<G>, graph: &'a mut RankingRuleGraph<G>,
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>, all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>, dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
} }
/// The internal state of the traversal algorithm
struct VisitorState<G: RankingRuleGraphTrait> { struct VisitorState<G: RankingRuleGraphTrait> {
/// Budget from the current node to the end node
remaining_cost: u64, remaining_cost: u64,
/// Previously visited conditions, in order.
path: Vec<Interned<G::Condition>>, path: Vec<Interned<G::Condition>>,
/// Previously visited conditions, as an efficient and compact set.
visited_conditions: SmallBitmap<G::Condition>, visited_conditions: SmallBitmap<G::Condition>,
/// Previously visited (ie not skipped) nodes, as an efficient and compact set.
visited_nodes: SmallBitmap<QueryNode>, visited_nodes: SmallBitmap<QueryNode>,
/// The conditions that cannot be visited anymore
forbidden_conditions: SmallBitmap<G::Condition>, forbidden_conditions: SmallBitmap<G::Condition>,
forbidden_conditions_to_nodes: SmallBitmap<QueryNode>, /// The nodes that cannot be visited anymore (they must be skipped)
nodes_to_skip: SmallBitmap<QueryNode>,
} }
/// See module documentation
pub struct PathVisitor<'a, G: RankingRuleGraphTrait> { pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
state: VisitorState<G>, state: VisitorState<G>,
ctx: VisitorContext<'a, G>, ctx: VisitorContext<'a, G>,
@ -56,14 +110,13 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
forbidden_conditions: SmallBitmap::for_interned_values_in( forbidden_conditions: SmallBitmap::for_interned_values_in(
&graph.conditions_interner, &graph.conditions_interner,
), ),
forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in( nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
&graph.query_graph.nodes,
),
}, },
ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache }, ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
} }
} }
/// See module documentation
pub fn visit_paths(mut self, visit: VisitFn<G>) -> Result<()> { pub fn visit_paths(mut self, visit: VisitFn<G>) -> Result<()> {
let _ = let _ =
self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?; self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
@ -72,22 +125,31 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
} }
impl<G: RankingRuleGraphTrait> VisitorState<G> { impl<G: RankingRuleGraphTrait> VisitorState<G> {
/// Visits a node: traverse all its valid conditional and unconditional edges.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_node( fn visit_node(
&mut self, &mut self,
from_node: Interned<QueryNode>, from_node: Interned<QueryNode>,
visit: VisitFn<G>, visit: VisitFn<G>,
ctx: &mut VisitorContext<G>, ctx: &mut VisitorContext<G>,
) -> Result<ControlFlow<(), bool>> { ) -> Result<ControlFlow<(), bool>> {
// any valid path will be found from this point
// if a valid path was found, then we know that the DeadEndsCache may have been updated,
// and we will need to do more work to potentially backtrack
let mut any_valid = false; let mut any_valid = false;
let edges = ctx.graph.edges_of_node.get(from_node).clone(); let edges = ctx.graph.edges_of_node.get(from_node).clone();
for edge_idx in edges.iter() { for edge_idx in edges.iter() {
// could be none if the edge was deleted
let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue }; let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
if self.remaining_cost < edge.cost as u64 { if self.remaining_cost < edge.cost as u64 {
continue; continue;
} }
self.remaining_cost -= edge.cost as u64; self.remaining_cost -= edge.cost as u64;
let cf = match edge.condition { let cf = match edge.condition {
Some(condition) => self.visit_condition( Some(condition) => self.visit_condition(
condition, condition,
@ -119,6 +181,10 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
Ok(ControlFlow::Continue(any_valid)) Ok(ControlFlow::Continue(any_valid))
} }
/// Visits an unconditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_no_condition( fn visit_no_condition(
&mut self, &mut self,
dest_node: Interned<QueryNode>, dest_node: Interned<QueryNode>,
@ -134,20 +200,29 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
{ {
return Ok(ControlFlow::Continue(false)); return Ok(ControlFlow::Continue(false));
} }
// We've reached the END node!
if dest_node == ctx.graph.query_graph.end_node { if dest_node == ctx.graph.query_graph.end_node {
let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?; let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
// We could change the return type of the visit closure such that the caller
// tells us whether the dead ends cache was updated or not.
// Alternatively, maybe the DeadEndsCache should have a generation number
// to it, so that we don't need to play with these booleans at all.
match control_flow { match control_flow {
ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)), ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
ControlFlow::Break(_) => Ok(ControlFlow::Break(())), ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
} }
} else { } else {
let old_fbct = self.forbidden_conditions_to_nodes.clone(); let old_fbct = self.nodes_to_skip.clone();
self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); self.nodes_to_skip.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?; let cf = self.visit_node(dest_node, visit, ctx)?;
self.forbidden_conditions_to_nodes = old_fbct; self.nodes_to_skip = old_fbct;
Ok(cf) Ok(cf)
} }
} }
/// Visits a conditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_condition( fn visit_condition(
&mut self, &mut self,
condition: Interned<G::Condition>, condition: Interned<G::Condition>,
@ -159,7 +234,7 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
assert!(dest_node != ctx.graph.query_graph.end_node); assert!(dest_node != ctx.graph.query_graph.end_node);
if self.forbidden_conditions.contains(condition) if self.forbidden_conditions.contains(condition)
|| self.forbidden_conditions_to_nodes.contains(dest_node) || self.nodes_to_skip.contains(dest_node)
|| edge_new_nodes_to_skip.intersects(&self.visited_nodes) || edge_new_nodes_to_skip.intersects(&self.visited_nodes)
{ {
return Ok(ControlFlow::Continue(false)); return Ok(ControlFlow::Continue(false));
@ -180,19 +255,19 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
self.visited_nodes.insert(dest_node); self.visited_nodes.insert(dest_node);
self.visited_conditions.insert(condition); self.visited_conditions.insert(condition);
let old_fc = self.forbidden_conditions.clone(); let old_forb_cond = self.forbidden_conditions.clone();
if let Some(next_forbidden) = if let Some(next_forbidden) =
ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied()) ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
{ {
self.forbidden_conditions.union(&next_forbidden); self.forbidden_conditions.union(&next_forbidden);
} }
let old_fctn = self.forbidden_conditions_to_nodes.clone(); let old_nodes_to_skip = self.nodes_to_skip.clone();
self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); self.nodes_to_skip.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?; let cf = self.visit_node(dest_node, visit, ctx)?;
self.forbidden_conditions_to_nodes = old_fctn; self.nodes_to_skip = old_nodes_to_skip;
self.forbidden_conditions = old_fc; self.forbidden_conditions = old_forb_cond;
self.visited_conditions.remove(condition); self.visited_conditions.remove(condition);
self.visited_nodes.remove(dest_node); self.visited_nodes.remove(dest_node);
@ -205,18 +280,12 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> { impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> { pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]); let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);
let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());
let mut node_stack = VecDeque::new(); self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
if cur_node == self.query_graph.end_node {
*costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
return;
for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { }
node_stack.push_back(prev_node);
enqueued.insert(prev_node);
}
while let Some(cur_node) = node_stack.pop_front() {
let mut self_costs = Vec::<u64>::new(); let mut self_costs = Vec::<u64>::new();
let cur_node_edges = &self.edges_of_node.get(cur_node); let cur_node_edges = &self.edges_of_node.get(cur_node);
@ -232,13 +301,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
self_costs.dedup(); self_costs.dedup();
*costs_to_end.get_mut(cur_node) = self_costs; *costs_to_end.get_mut(cur_node) = self_costs;
for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { });
if !enqueued.contains(prev_node) {
node_stack.push_back(prev_node);
enqueued.insert(prev_node);
}
}
}
costs_to_end costs_to_end
} }
@ -247,17 +310,12 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
node_with_removed_outgoing_conditions: Interned<QueryNode>, node_with_removed_outgoing_conditions: Interned<QueryNode>,
costs: &mut MappedInterner<QueryNode, Vec<u64>>, costs: &mut MappedInterner<QueryNode, Vec<u64>>,
) { ) {
let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
let mut node_stack = VecDeque::new(); // We first check that no other node is contributing the same total cost to a predecessor before removing
// the cost from the predecessor.
enqueued.insert(node_with_removed_outgoing_conditions); self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
node_stack.push_back(node_with_removed_outgoing_conditions);
'main_loop: while let Some(cur_node) = node_stack.pop_front() {
let mut costs_to_remove = FxHashSet::default(); let mut costs_to_remove = FxHashSet::default();
for c in costs.get(cur_node) { costs_to_remove.extend(costs.get(cur_node).iter().copied());
costs_to_remove.insert(*c);
}
let cur_node_edges = &self.edges_of_node.get(cur_node); let cur_node_edges = &self.edges_of_node.get(cur_node);
for edge_idx in cur_node_edges.iter() { for edge_idx in cur_node_edges.iter() {
@ -265,22 +323,75 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
for cost in costs.get(edge.dest_node).iter() { for cost in costs.get(edge.dest_node).iter() {
costs_to_remove.remove(&(*cost + edge.cost as u64)); costs_to_remove.remove(&(*cost + edge.cost as u64));
if costs_to_remove.is_empty() { if costs_to_remove.is_empty() {
continue 'main_loop; return;
} }
} }
} }
if costs_to_remove.is_empty() { if costs_to_remove.is_empty() {
continue 'main_loop; return;
} }
let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied()); let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
for c in costs_to_remove { for c in costs_to_remove {
new_costs.remove(&c); new_costs.remove(&c);
} }
*costs.get_mut(cur_node) = new_costs.into_iter().collect(); *costs.get_mut(cur_node) = new_costs.into_iter().collect();
});
}
/// Traverse the graph backwards from the given node such that every time
/// a node is visited, we are guaranteed that all its successors either:
/// 1. have already been visited; OR
/// 2. were not reachable from the given node
pub fn traverse_breadth_first_backward(
&self,
from: Interned<QueryNode>,
mut visit: impl FnMut(Interned<QueryNode>),
) {
let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
{
// go backward to get the set of all reachable nodes from the given node
// the nodes that are not reachable will be set as `visited`
let mut stack = VecDeque::new();
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
enqueued.insert(from);
stack.push_back(from);
while let Some(n) = stack.pop_front() {
if reachable.contains(n) {
continue;
}
reachable.insert(n);
for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
stack.push_back(prev_node);
enqueued.insert(prev_node);
}
}
}
};
let mut unreachable_or_visited =
SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
for (n, _) in self.query_graph.nodes.iter() {
if !reachable.contains(n) {
unreachable_or_visited.insert(n);
}
}
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
let mut stack = VecDeque::new();
enqueued.insert(from);
stack.push_back(from);
while let Some(cur_node) = stack.pop_front() {
if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
stack.push_back(cur_node);
continue;
}
unreachable_or_visited.insert(cur_node);
visit(cur_node);
for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
if !enqueued.contains(prev_node) { if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
node_stack.push_back(prev_node); stack.push_back(prev_node);
enqueued.insert(prev_node); enqueued.insert(prev_node);
} }
} }

View File

@ -9,12 +9,8 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext; use crate::search::new::SearchContext;
use crate::Result; use crate::Result;
// TODO: give a generation to each universe, then be able to get the exact
// delta of docids between two universes of different generations!
/// A cache storing the document ids associated with each ranking rule edge /// A cache storing the document ids associated with each ranking rule edge
pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> { pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
// TOOD: should be a mapped interner?
pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>, pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
_phantom: PhantomData<G>, _phantom: PhantomData<G>,
} }
@ -54,7 +50,7 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
} }
let condition = graph.conditions_interner.get_mut(interned_condition); let condition = graph.conditions_interner.get_mut(interned_condition);
let computed = G::resolve_condition(ctx, condition, universe)?; let computed = G::resolve_condition(ctx, condition, universe)?;
// TODO: if computed.universe_len != universe.len() ? // Can we put an assert here for computed.universe_len == universe.len() ?
let _ = self.cache.insert(interned_condition, computed); let _ = self.cache.insert(interned_condition, computed);
let computed = &self.cache[&interned_condition]; let computed = &self.cache[&interned_condition];
Ok(computed) Ok(computed)

View File

@ -2,6 +2,7 @@ use crate::search::new::interner::{FixedSizeInterner, Interned};
use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::small_bitmap::SmallBitmap;
pub struct DeadEndsCache<T> { pub struct DeadEndsCache<T> {
// conditions and next could/should be part of the same vector
conditions: Vec<Interned<T>>, conditions: Vec<Interned<T>>,
next: Vec<Self>, next: Vec<Self>,
pub forbidden: SmallBitmap<T>, pub forbidden: SmallBitmap<T>,
@ -27,7 +28,7 @@ impl<T> DeadEndsCache<T> {
self.forbidden.insert(condition); self.forbidden.insert(condition);
} }
pub fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> { fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
if let Some(idx) = self.conditions.iter().position(|c| *c == condition) { if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
Some(&mut self.next[idx]) Some(&mut self.next[idx])
} else { } else {

View File

@ -69,14 +69,9 @@ impl RankingRuleGraphTrait for FidGraph {
let mut edges = vec![]; let mut edges = vec![];
for fid in all_fields { for fid in all_fields {
// TODO: We can improve performances and relevancy by storing
// the term subsets associated to each field ids fetched.
edges.push(( edges.push((
fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. fid as u32 * term.term_ids.len() as u32,
conditions_interner.insert(FidCondition { conditions_interner.insert(FidCondition { term: term.clone(), fid }),
term: term.clone(), // TODO remove this ugly clone
fid,
}),
)); ));
} }

View File

@ -20,6 +20,8 @@ mod position;
mod proximity; mod proximity;
/// Implementation of the `typo` ranking rule /// Implementation of the `typo` ranking rule
mod typo; mod typo;
/// Implementation of the `words` ranking rule
mod words;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::hash::Hash; use std::hash::Hash;
@ -33,6 +35,7 @@ pub use position::{PositionCondition, PositionGraph};
pub use proximity::{ProximityCondition, ProximityGraph}; pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
pub use typo::{TypoCondition, TypoGraph}; pub use typo::{TypoCondition, TypoGraph};
pub use words::{WordsCondition, WordsGraph};
use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner};
use super::query_term::LocatedQueryTermSubset; use super::query_term::LocatedQueryTermSubset;

View File

@ -94,14 +94,9 @@ impl RankingRuleGraphTrait for PositionGraph {
let mut edges = vec![]; let mut edges = vec![];
for (cost, positions) in positions_for_costs { for (cost, positions) in positions_for_costs {
// TODO: We can improve performances and relevancy by storing
// the term subsets associated to each position fetched
edges.push(( edges.push((
cost, cost,
conditions_interner.insert(PositionCondition { conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
term: term.clone(), // TODO remove this ugly clone
positions,
}),
)); ));
} }
@ -111,23 +106,16 @@ impl RankingRuleGraphTrait for PositionGraph {
fn cost_from_position(sum_positions: u32) -> u32 { fn cost_from_position(sum_positions: u32) -> u32 {
match sum_positions { match sum_positions {
0 | 1 | 2 | 3 => sum_positions, 0 => 0,
4 | 5 => 4, 1 => 1,
6 | 7 => 5, 2..=4 => 2,
8 | 9 => 6, 5..=7 => 3,
10 | 11 => 7, 8..=11 => 4,
12 | 13 => 8, 12..=16 => 5,
14 | 15 => 9, 17..=24 => 6,
16 | 17..=24 => 10, 25..=64 => 7,
25..=32 => 11, 65..=256 => 8,
33..=64 => 12, 257..=1024 => 9,
65..=128 => 13, _ => 10,
129..=256 => 14,
257..=512 => 15,
513..=1024 => 16,
1025..=2048 => 17,
2049..=4096 => 18,
4097..=8192 => 19,
_ => 20,
} }
} }

View File

@ -65,13 +65,6 @@ pub fn compute_docids(
} }
} }
// TODO: add safeguard in case the cartesian product is too large!
// even if we restrict the word derivations to a maximum of 100, the size of the
// caterisan product could reach a maximum of 10_000 derivations, which is way too much.
// Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? { for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
// Before computing the edges, check that the left word and left phrase // Before computing the edges, check that the left word and left phrase
// aren't disjoint with the universe, but only do it if there is more than // aren't disjoint with the universe, but only do it if there is more than
@ -111,8 +104,6 @@ pub fn compute_docids(
Ok(ComputedCondition { Ok(ComputedCondition {
docids, docids,
universe_len: universe.len(), universe_len: universe.len(),
// TODO: think about whether we want to reduce the subset,
// we probably should!
start_term_subset: Some(left_term.clone()), start_term_subset: Some(left_term.clone()),
end_term_subset: right_term.clone(), end_term_subset: right_term.clone(),
}) })
@ -203,12 +194,7 @@ fn compute_non_prefix_edges(
*docids |= new_docids; *docids |= new_docids;
} }
} }
if backward_proximity >= 1 if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
// TODO: for now, we don't do any swapping when either term is a phrase
// but maybe we should. We'd need to look at the first/last word of the phrase
// depending on the context.
&& left_phrase.is_none() && right_phrase.is_none()
{
if let Some(new_docids) = if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
{ {

View File

@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph {
// 3-gram -> equivalent to 2 typos // 3-gram -> equivalent to 2 typos
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
let mut term = term.clone(); let mut term = term.clone();
match nbr_typos { match nbr_typos {
0 => { 0 => {

View File

@ -0,0 +1,49 @@
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext;
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct WordsCondition {
term: LocatedQueryTermSubset,
}
pub enum WordsGraph {}
impl RankingRuleGraphTrait for WordsGraph {
type Condition = WordsCondition;
fn resolve_condition(
ctx: &mut SearchContext,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let WordsCondition { term, .. } = condition;
// maybe compute_query_term_subset_docids should accept a universe as argument
let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
docids &= universe;
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
})
}
fn build_edges(
_ctx: &mut SearchContext,
conditions_interner: &mut DedupInterner<Self::Condition>,
_from: Option<&LocatedQueryTermSubset>,
to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
Ok(vec![(
to_term.term_ids.len() as u32,
conditions_interner.insert(WordsCondition { term: to_term.clone() }),
)])
}
}

View File

@ -33,8 +33,6 @@ pub fn compute_query_term_subset_docids(
ctx: &mut SearchContext, ctx: &mut SearchContext,
term: &QueryTermSubset, term: &QueryTermSubset,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
// TODO Use the roaring::MultiOps trait
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_docids) = ctx.word_docids(word)? { if let Some(word_docids) = ctx.word_docids(word)? {
@ -59,8 +57,6 @@ pub fn compute_query_term_subset_docids_within_field_id(
term: &QueryTermSubset, term: &QueryTermSubset,
fid: u16, fid: u16,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
// TODO Use the roaring::MultiOps trait
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? {
@ -71,7 +67,6 @@ pub fn compute_query_term_subset_docids_within_field_id(
for phrase in term.all_phrases(ctx)? { for phrase in term.all_phrases(ctx)? {
// There may be false positives when resolving a phrase, so we're not // There may be false positives when resolving a phrase, so we're not
// guaranteed that all of its words are within a single fid. // guaranteed that all of its words are within a single fid.
// TODO: fix this?
if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
@ -95,7 +90,6 @@ pub fn compute_query_term_subset_docids_within_position(
term: &QueryTermSubset, term: &QueryTermSubset,
position: u16, position: u16,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
// TODO Use the roaring::MultiOps trait
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_position_docids) = if let Some(word_position_docids) =
@ -108,7 +102,6 @@ pub fn compute_query_term_subset_docids_within_position(
for phrase in term.all_phrases(ctx)? { for phrase in term.all_phrases(ctx)? {
// It's difficult to know the expected position of the words in the phrase, // It's difficult to know the expected position of the words in the phrase,
// so instead we just check the first one. // so instead we just check the first one.
// TODO: fix this?
if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? { if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
docids |= ctx.get_phrase_docids(phrase)? & word_position_docids docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
@ -132,9 +125,6 @@ pub fn compute_query_graph_docids(
q: &QueryGraph, q: &QueryGraph,
universe: &RoaringBitmap, universe: &RoaringBitmap,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
// TODO: there must be a faster way to compute this big
// roaring bitmap expression
let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new()); let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());

View File

@ -141,10 +141,6 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx,
universe: &RoaringBitmap, universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Query>>> { ) -> Result<Option<RankingRuleOutput<Query>>> {
let iter = self.iter.as_mut().unwrap(); let iter = self.iter.as_mut().unwrap();
// TODO: we should make use of the universe in the function below
// good for correctness, but ideally iter.next_bucket would take the current universe into account,
// as right now it could return buckets that don't intersect with the universe, meaning we will make many
// unneeded calls.
if let Some(mut bucket) = iter.next_bucket()? { if let Some(mut bucket) = iter.next_bucket()? {
bucket.candidates &= universe; bucket.candidates &= universe;
Ok(Some(bucket)) Ok(Some(bucket))

View File

@ -138,7 +138,7 @@ fn test_attribute_position_simple() {
s.terms_matching_strategy(TermsMatchingStrategy::All); s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("quick brown"); s.query("quick brown");
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 2, 3, 4, 1, 0, 6, 8, 7, 9, 5]");
} }
#[test] #[test]
fn test_attribute_position_repeated() { fn test_attribute_position_repeated() {
@ -163,7 +163,7 @@ fn test_attribute_position_different_fields() {
s.terms_matching_strategy(TermsMatchingStrategy::All); s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("quick brown"); s.query("quick brown");
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 2, 3, 4, 1, 0, 6, 8, 7, 9, 5]");
} }
#[test] #[test]
@ -176,5 +176,5 @@ fn test_attribute_position_ngrams() {
s.terms_matching_strategy(TermsMatchingStrategy::All); s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("quick brown"); s.query("quick brown");
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 2, 3, 4, 1, 0, 6, 8, 7, 9, 5]");
} }

View File

@ -527,7 +527,7 @@ fn test_distinct_all_candidates() {
let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap(); let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap();
let candidates = candidates.iter().collect::<Vec<_>>(); let candidates = candidates.iter().collect::<Vec<_>>();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
// TODO: this is incorrect! // This is incorrect, but unfortunately impossible to do better efficiently.
insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]"); insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]");
} }

View File

@ -125,8 +125,8 @@ fn create_edge_cases_index() -> TempIndex {
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
// If the search query is "sunflower", the split word "Sun Flower" will match some documents. // If the search query is "sunflower", the split word "Sun Flower" will match some documents.
// If the query is `sunflower wilting`, then we should make sure that // If the query is `sunflower wilting`, then we should make sure that
// the sprximity condition `flower wilting: sprx N` also comes with the condition // the proximity condition `flower wilting: sprx N` also comes with the condition
// `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now. // `sun wilting: sprx N+1`, but this is not the exact condition we use for now.
// We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which // We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which
// is better than nothing but not the best. // is better than nothing but not the best.
{ {
@ -299,7 +299,7 @@ fn test_proximity_split_word() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: "2" and "4" should be swapped ideally // "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@ -316,7 +316,7 @@ fn test_proximity_split_word() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: "2" and "4" should be swapped ideally // "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@ -341,7 +341,7 @@ fn test_proximity_split_word() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: "2" and "4" should be swapped ideally // "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",

View File

@ -2,9 +2,8 @@
This module tests the interactions between the proximity and typo ranking rules. This module tests the interactions between the proximity and typo ranking rules.
The proximity ranking rule should transform the query graph such that it The proximity ranking rule should transform the query graph such that it
only contains the word pairs that it used to compute its bucket. only contains the word pairs that it used to compute its bucket, but this is not currently
implemented.
TODO: This is not currently implemented.
*/ */
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
@ -64,7 +63,7 @@ fn test_trap_basic() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: this is incorrect, 1 should come before 0 // This is incorrect, 1 should come before 0
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"summer. holiday. sommer holidty\"", "\"summer. holiday. sommer holidty\"",

View File

@ -571,8 +571,8 @@ fn test_typo_synonyms() {
s.terms_matching_strategy(TermsMatchingStrategy::All); s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the fast brownish fox jumps over the lackadaisical dog"); s.query("the fast brownish fox jumps over the lackadaisical dog");
// TODO: is this correct? interaction of ngrams + synonyms means that the // The interaction of ngrams + synonyms means that the multi-word synonyms end up having a typo cost.
// multi-word synonyms end up having a typo cost. This is probably not what we want. // This is probably not what we want.
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);

View File

@ -1,87 +0,0 @@
use roaring::RoaringBitmap;
use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::resolve_query_graph::compute_query_graph_docids;
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::{Result, TermsMatchingStrategy};
pub struct Words {
exhausted: bool, // TODO: remove
query_graph: Option<QueryGraph>,
nodes_to_remove: Vec<SmallBitmap<QueryNode>>,
terms_matching_strategy: TermsMatchingStrategy,
}
impl Words {
pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
Self {
exhausted: true,
query_graph: None,
nodes_to_remove: vec![],
terms_matching_strategy,
}
}
}
impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
fn id(&self) -> String {
"words".to_owned()
}
fn start_iteration(
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
_universe: &RoaringBitmap,
parent_query_graph: &QueryGraph,
) -> Result<()> {
self.exhausted = false;
self.query_graph = Some(parent_query_graph.clone());
self.nodes_to_remove = match self.terms_matching_strategy {
TermsMatchingStrategy::Last => {
let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx);
ns.reverse();
ns
}
TermsMatchingStrategy::All => {
vec![]
}
};
Ok(())
}
fn next_bucket(
&mut self,
ctx: &mut SearchContext<'ctx>,
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
if self.exhausted {
return Ok(None);
}
let Some(query_graph) = &mut self.query_graph else { panic!() };
logger.log_internal_state(query_graph);
let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?;
let child_query_graph = query_graph.clone();
if self.nodes_to_remove.is_empty() {
self.exhausted = true;
} else {
let nodes_to_remove = self.nodes_to_remove.pop().unwrap();
query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::<Vec<_>>());
}
Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
}
fn end_iteration(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.exhausted = true;
self.nodes_to_remove = vec![];
self.query_graph = None;
}
}

View File

@ -89,7 +89,6 @@ Create a snapshot test of the given database.
- `exact_word_docids` - `exact_word_docids`
- `word_prefix_docids` - `word_prefix_docids`
- `exact_word_prefix_docids` - `exact_word_prefix_docids`
- `docid_word_positions`
- `word_pair_proximity_docids` - `word_pair_proximity_docids`
- `word_prefix_pair_proximity_docids` - `word_prefix_pair_proximity_docids`
- `word_position_docids` - `word_position_docids`
@ -217,11 +216,6 @@ pub fn snap_exact_word_prefix_docids(index: &Index) -> String {
&format!("{s:<16} {}", display_bitmap(&b)) &format!("{s:<16} {}", display_bitmap(&b))
}) })
} }
pub fn snap_docid_word_positions(index: &Index) -> String {
make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| {
&format!("{idx:<6} {s:<16} {}", display_bitmap(&b))
})
}
pub fn snap_word_pair_proximity_docids(index: &Index) -> String { pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| { make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| {
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
@ -324,7 +318,7 @@ pub fn snap_field_distributions(index: &Index) -> String {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let mut snap = String::new(); let mut snap = String::new();
for (field, count) in index.field_distribution(&rtxn).unwrap() { for (field, count) in index.field_distribution(&rtxn).unwrap() {
writeln!(&mut snap, "{field:<16} {count:<6}").unwrap(); writeln!(&mut snap, "{field:<16} {count:<6} |").unwrap();
} }
snap snap
} }
@ -334,7 +328,7 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
let mut snap = String::new(); let mut snap = String::new();
for field_id in fields_ids_map.ids() { for field_id in fields_ids_map.ids() {
let name = fields_ids_map.name(field_id).unwrap(); let name = fields_ids_map.name(field_id).unwrap();
writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); writeln!(&mut snap, "{field_id:<3} {name:<16} |").unwrap();
} }
snap snap
} }
@ -477,9 +471,6 @@ macro_rules! full_snap_of_db {
($index:ident, exact_word_prefix_docids) => {{ ($index:ident, exact_word_prefix_docids) => {{
$crate::snapshot_tests::snap_exact_word_prefix_docids(&$index) $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
}}; }};
($index:ident, docid_word_positions) => {{
$crate::snapshot_tests::snap_docid_word_positions(&$index)
}};
($index:ident, word_pair_proximity_docids) => {{ ($index:ident, word_pair_proximity_docids) => {{
$crate::snapshot_tests::snap_word_pair_proximity_docids(&$index) $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
}}; }};

View File

@ -1,7 +1,7 @@
--- ---
source: milli/src/index.rs source: milli/src/index.rs
--- ---
age 1 age 1 |
id 2 id 2 |
name 2 name 2 |

View File

@ -1,7 +1,7 @@
--- ---
source: milli/src/index.rs source: milli/src/index.rs
--- ---
age 1 age 1 |
id 2 id 2 |
name 2 name 2 |

View File

@ -23,7 +23,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
exact_word_docids, exact_word_docids,
word_prefix_docids, word_prefix_docids,
exact_word_prefix_docids, exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids, word_pair_proximity_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
prefix_word_pair_proximity_docids, prefix_word_pair_proximity_docids,
@ -80,7 +79,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
exact_word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?;
word_prefix_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?;
exact_word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?;
docid_word_positions.clear(self.wtxn)?;
word_pair_proximity_docids.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?;
word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?;
prefix_word_pair_proximity_docids.clear(self.wtxn)?; prefix_word_pair_proximity_docids.clear(self.wtxn)?;
@ -141,7 +139,6 @@ mod tests {
assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());

View File

@ -1,5 +1,5 @@
use std::collections::btree_map::Entry; use std::collections::btree_map::Entry;
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeSet, HashMap, HashSet};
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
@ -15,8 +15,7 @@ use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{ use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
SmallString32, BEU32,
}; };
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {
@ -72,7 +71,6 @@ impl std::fmt::Display for DeletionStrategy {
pub(crate) struct DetailedDocumentDeletionResult { pub(crate) struct DetailedDocumentDeletionResult {
pub deleted_documents: u64, pub deleted_documents: u64,
pub remaining_documents: u64, pub remaining_documents: u64,
pub soft_deletion_used: bool,
} }
impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
@ -109,11 +107,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
Some(docid) Some(docid)
} }
pub fn execute(self) -> Result<DocumentDeletionResult> { pub fn execute(self) -> Result<DocumentDeletionResult> {
let DetailedDocumentDeletionResult { let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
deleted_documents, self.execute_inner()?;
remaining_documents,
soft_deletion_used: _,
} = self.execute_inner()?;
Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
} }
@ -134,7 +129,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
return Ok(DetailedDocumentDeletionResult { return Ok(DetailedDocumentDeletionResult {
deleted_documents: 0, deleted_documents: 0,
remaining_documents: 0, remaining_documents: 0,
soft_deletion_used: false,
}); });
} }
@ -150,7 +144,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
return Ok(DetailedDocumentDeletionResult { return Ok(DetailedDocumentDeletionResult {
deleted_documents: current_documents_ids_len, deleted_documents: current_documents_ids_len,
remaining_documents, remaining_documents,
soft_deletion_used: false,
}); });
} }
@ -219,7 +212,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
return Ok(DetailedDocumentDeletionResult { return Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(), deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(), remaining_documents: documents_ids.len(),
soft_deletion_used: true,
}); });
} }
@ -232,7 +224,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
exact_word_docids, exact_word_docids,
word_prefix_docids, word_prefix_docids,
exact_word_prefix_docids, exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids, word_pair_proximity_docids,
field_id_word_count_docids, field_id_word_count_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
@ -251,23 +242,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
facet_id_is_empty_docids, facet_id_is_empty_docids,
documents, documents,
} = self.index; } = self.index;
// Remove from the documents database
// Retrieve the words contained in the documents.
let mut words = Vec::new();
for docid in &self.to_delete_docids { for docid in &self.to_delete_docids {
documents.delete(self.wtxn, &BEU32::new(docid))?; documents.delete(self.wtxn, &BEU32::new(docid))?;
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
// We create an iterator to be able to get the content and delete the key-value itself.
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
while let Some(result) = iter.next() {
let ((_docid, word), _positions) = result?;
// This boolean will indicate if we must remove this word from the words FST.
words.push((SmallString32::from(word), false));
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
}
} }
// We acquire the current external documents ids map... // We acquire the current external documents ids map...
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids` // Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
@ -278,42 +255,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let new_external_documents_ids = new_external_documents_ids.into_static(); let new_external_documents_ids = new_external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
// Maybe we can improve the get performance of the words let mut words_to_keep = BTreeSet::default();
// if we sort the words first, keeping the LMDB pages in cache. let mut words_to_delete = BTreeSet::default();
words.sort_unstable();
// We iterate over the words and delete the documents ids // We iterate over the words and delete the documents ids
// from the word docids database. // from the word docids database.
for (word, must_remove) in &mut words { remove_from_word_docids(
remove_from_word_docids( self.wtxn,
self.wtxn, word_docids,
word_docids, &self.to_delete_docids,
word.as_str(), &mut words_to_keep,
must_remove, &mut words_to_delete,
&self.to_delete_docids, )?;
)?; remove_from_word_docids(
self.wtxn,
remove_from_word_docids( exact_word_docids,
self.wtxn, &self.to_delete_docids,
exact_word_docids, &mut words_to_keep,
word.as_str(), &mut words_to_delete,
must_remove, )?;
&self.to_delete_docids,
)?;
}
// We construct an FST set that contains the words to delete from the words FST. // We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
words.iter().filter_map(
|(word, must_remove)| {
if *must_remove {
Some(word.as_str())
} else {
None
}
},
);
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
let new_words_fst = { let new_words_fst = {
// We retrieve the current words FST from the database. // We retrieve the current words FST from the database.
@ -472,7 +434,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
Ok(DetailedDocumentDeletionResult { Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(), deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(), remaining_documents: documents_ids.len(),
soft_deletion_used: false,
}) })
} }
@ -532,23 +493,24 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids( fn remove_from_word_docids(
txn: &mut heed::RwTxn, txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>, db: &heed::Database<Str, RoaringBitmapCodec>,
word: &str,
must_remove: &mut bool,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>,
) -> Result<()> { ) -> Result<()> {
// We create an iterator to be able to get the content and delete the word docids. // We create an iterator to be able to get the content and delete the word docids.
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
// the LMDB B-Tree two times but only once. // the LMDB B-Tree two times but only once.
let mut iter = db.prefix_iter_mut(txn, word)?; let mut iter = db.iter_mut(txn)?;
if let Some((key, mut docids)) = iter.next().transpose()? { while let Some((key, mut docids)) = iter.next().transpose()? {
if key == word { let previous_len = docids.len();
let previous_len = docids.len(); docids -= to_remove;
docids -= to_remove; if docids.is_empty() {
if docids.is_empty() { // safety: we don't keep references from inside the LMDB database.
// safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? };
unsafe { iter.del_current()? }; words_to_remove.insert(key.to_owned());
*must_remove = true; } else {
} else if docids.len() != previous_len { words_to_keep.insert(key.to_owned());
if docids.len() != previous_len {
let key = key.to_owned(); let key = key.to_owned();
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.put_current(&key, &docids)? }; unsafe { iter.put_current(&key, &docids)? };
@ -627,7 +589,7 @@ mod tests {
use super::*; use super::*;
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::{db_snap, Filter}; use crate::{db_snap, Filter, Search};
fn delete_documents<'t>( fn delete_documents<'t>(
wtxn: &mut RwTxn<'t, '_>, wtxn: &mut RwTxn<'t, '_>,
@ -1199,4 +1161,52 @@ mod tests {
DeletionStrategy::AlwaysSoft, DeletionStrategy::AlwaysSoft,
); );
} }
#[test]
fn delete_words_exact_attributes() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key(S("id"));
settings.set_searchable_fields(vec![S("text"), S("exact")]);
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "text": "hello" },
{ "id": 1, "exact": "hello"}
]))
.unwrap();
db_snap!(index, word_docids, 1, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 1, @r###"
hello [1, ]
"###);
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
let mut wtxn = index.write_txn().unwrap();
let deleted_internal_ids =
delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard);
wtxn.commit().unwrap();
db_snap!(index, word_docids, 2, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 2, @"");
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
let txn = index.read_txn().unwrap();
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
let mut s = Search::new(&txn, &index);
s.query("hello");
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
}
} }

View File

@ -261,7 +261,9 @@ pub(crate) mod test_helpers {
let options = options.map_size(4096 * 4 * 1000 * 100); let options = options.map_size(4096 * 4 * 1000 * 100);
let tempdir = tempfile::TempDir::new().unwrap(); let tempdir = tempfile::TempDir::new().unwrap();
let env = options.open(tempdir.path()).unwrap(); let env = options.open(tempdir.path()).unwrap();
let content = env.create_database(None).unwrap(); let mut wtxn = env.write_txn().unwrap();
let content = env.create_database(&mut wtxn, None).unwrap();
wtxn.commit().unwrap();
FacetIndex { FacetIndex {
content, content,

Some files were not shown because too many files have changed in this diff Show More