1421: Transplant the new search engine r=tpayet a=curquiza



Co-authored-by: tamo <tamo@meilisearch.com>
Co-authored-by: Marin Postma <postma.marin@protonmail.com>
Co-authored-by: bors[bot] <26634292+bors[bot]@users.noreply.github.com>
Co-authored-by: Irevoire <tamo@meilisearch.com>
Co-authored-by: marin <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2021-06-30 14:14:11 +00:00 committed by GitHub
commit d61852a73f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
203 changed files with 11892 additions and 25396 deletions

13
.github/release-draft-template.yml vendored Normal file
View File

@ -0,0 +1,13 @@
name-template: 'v$RESOLVED_VERSION'
tag-template: 'v$RESOLVED_VERSION'
version-template: '0.21.0-alpha.$PATCH'
exclude-labels:
- 'skip-changelog'
template: |
## Changes
$CHANGES
no-changes-template: 'Changes are coming soon 😎'
sort-direction: 'ascending'
version-resolver:
default: patch

15
.github/workflows/flaky.yml vendored Normal file
View File

@ -0,0 +1,15 @@
name: Look for flaky tests
on:
schedule:
- cron: "0 12 * * FRI" # every friday at 12:00PM
jobs:
flaky:
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2
- name: Install cargo-flaky
run: cargo install cargo-flaky
- name: Run cargo flaky 100 times
run: cargo flaky -i 100 --release

View File

@ -13,6 +13,9 @@ jobs:
- name: Check if current release is latest - name: Check if current release is latest
run: echo "##[set-output name=is_latest;]$(sh .github/is-latest-release.sh)" run: echo "##[set-output name=is_latest;]$(sh .github/is-latest-release.sh)"
id: release id: release
- name: Set COMMIT_DATE env variable
run: |
echo "COMMIT_DATE=$( git log --pretty=format:'%ad' -n1 --date=short )" >> $GITHUB_ENV
- name: Publish to Registry - name: Publish to Registry
if: steps.release.outputs.is_latest == 'true' if: steps.release.outputs.is_latest == 'true'
uses: elgohr/Publish-Docker-Github-Action@master uses: elgohr/Publish-Docker-Github-Action@master
@ -20,3 +23,5 @@ jobs:
name: getmeili/meilisearch name: getmeili/meilisearch
username: ${{ secrets.DOCKER_USERNAME }} username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }} password: ${{ secrets.DOCKER_PASSWORD }}
tag_names: true
buildargs: COMMIT_SHA,COMMIT_DATE

View File

@ -11,10 +11,16 @@ jobs:
runs-on: ubuntu-18.04 runs-on: ubuntu-18.04
steps: steps:
- uses: actions/checkout@v1 - uses: actions/checkout@v1
- name: Set COMMIT_DATE env variable
run: |
echo "COMMIT_DATE=$( git log --pretty=format:'%ad' -n1 --date=short )" >> $GITHUB_ENV
- name: Publish to Registry - name: Publish to Registry
uses: elgohr/Publish-Docker-Github-Action@master uses: elgohr/Publish-Docker-Github-Action@master
env:
COMMIT_SHA: ${{ github.sha }}
with: with:
name: getmeili/meilisearch name: getmeili/meilisearch
username: ${{ secrets.DOCKER_USERNAME }} username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }} password: ${{ secrets.DOCKER_PASSWORD }}
tag_names: true tag_names: true
buildargs: COMMIT_SHA,COMMIT_DATE

16
.github/workflows/release-drafter.yml vendored Normal file
View File

@ -0,0 +1,16 @@
name: Release Drafter
on:
push:
branches:
- main
jobs:
update_release_draft:
runs-on: ubuntu-latest
steps:
- uses: release-drafter/release-drafter@v5
with:
config-name: release-draft-template.yml
env:
GITHUB_TOKEN: ${{ secrets.RELEASE_DRAFTER_TOKEN }}

81
.github/workflows/rust.yml vendored Normal file
View File

@ -0,0 +1,81 @@
name: Rust
on:
workflow_dispatch:
pull_request:
push:
# trying and staging branches are for Bors config
branches:
- trying
- staging
env:
CARGO_TERM_COLOR: always
jobs:
tests:
name: Tests on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-18.04, macos-latest]
steps:
- uses: actions/checkout@v2
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
command: build
args: --locked --release --no-default-features
- name: Run cargo test
uses: actions-rs/cargo@v1
with:
command: test
args: --locked --release
# We don't run test on Windows since we get the following error: There is not enough space on the disk.
check-on-windows:
name: Cargo check on Windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
command: check
args: --no-default-features
- name: Run cargo check with all default features
uses: actions-rs/cargo@v1
with:
command: check
clippy:
name: Run Clippy
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
components: clippy
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets -- --deny warnings
fmt:
name: Run Rustfmt
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: nightly
override: true
components: rustfmt
- name: Run cargo fmt
run: cargo fmt --all -- --check

View File

@ -1,94 +0,0 @@
---
on:
push:
branches:
- release-v*
- trying
- staging
tags:
- 'v[0-9]+.[0-9]+.[0-9]+' # this only concerns tags on stable
name: Test binaries with cargo test
jobs:
check:
name: Test on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-18.04, macos-latest]
steps:
- uses: actions/checkout@v1
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
components: clippy
- name: Run cargo test
uses: actions-rs/cargo@v1
with:
command: test
args: --locked --release
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets
build-image:
name: Test the build of Docker image
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v1
- run: docker build . --file Dockerfile -t meilisearch
name: Docker build
## A push occurred on a release branch, a prerelease is created and assets are generated
prerelease:
name: create prerelease
needs: [check, build-image]
if: ${{ contains(github.ref, 'release-') && github.event_name == 'push' }}
runs-on: ubuntu-18.04
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Get version number
id: version-number
run: echo "##[set-output name=number;]$(echo ${{ github.ref }} | sed 's/.*\(v.*\)/\1/')"
- name: Get commit count
id: commit-count
run: echo "##[set-output name=count;]$(git rev-list remotes/origin/master..remotes/origin/release-${{ steps.version-number.outputs.number }} --count)"
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.PUBLISH_TOKEN }} # Personal Access Token
with:
tag_name: ${{ steps.version-number.outputs.number }}rc${{ steps.commit-count.outputs.count }}
release_name: Pre-release ${{ steps.version-number.outputs.number }}-rc${{ steps.commit-count.outputs.count }}
prerelease: true
## If a tag is pushed, a release is created for this tag, and assets will be generated
release:
name: create release
needs: [check, build-image]
if: ${{ contains(github.ref, 'tags/v') }}
runs-on: ubuntu-18.04
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Get version number
id: version-number
run: echo "##[set-output name=number;]$(echo ${{ github.ref }} | sed 's/.*\(v.*\)/\1/')"
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.PUBLISH_TOKEN }} # PAT
with:
tag_name: ${{ steps.version-number.outputs.number }}
release_name: Meilisearch ${{ steps.version-number.outputs.number }}
prerelease: false

1
.gitignore vendored
View File

@ -1,5 +1,4 @@
/target /target
meilisearch-core/target
**/*.csv **/*.csv
**/*.json_lines **/*.json_lines
**/*.rs.bk **/*.rs.bk

View File

@ -1,126 +0,0 @@
## v0.20.0 - 2021-03-22
- Fix build on mac M1 (#1280)
- Server root returns 200 in production (#1292)
- Healthcheck returns 200 (#1291)
- Snapshot temporary files are not created in /tmp anymore (#1238)
## v0.19.0 - 2021-02-09
- The snapshots are now created and then renamed in atomically (#1172)
- Fix a race condition when an update and a document addition are processed immediately one after the other (#1176)
- Latin synonyms are normalized during indexation (#1174)
## v0.18.1 - 2021-01-14
- Fix unexpected CORS error (#1185)
## v0.18.0 - 2021-01-11
- Integration with the new tokenizer (#1091)
- Fix setting consistency bug (#1128)
- Fix attributes to retrieve bug (#1131)
- Increase default payload size (#1147)
- Improvements to code quality (#1167, #1165, #1126, #1151)
## v0.17.0 - 2020-11-30
- Fix corrupted data during placeholder search (#1089)
- Remove maintenance error from http (#1082)
- Disable frontend in production (#1097)
- Update nbHits count with filtered documents (#849)
- Remove update changelog ci check (#1090)
- Add deploy on Platform.sh option to README (#1087)
- Change movie gifs in README (#1077)
- Remove some clippy warnings (#1100)
- Improve script `download-latest.sh` (#1054)
- Bump dependencies version (#1056, #1057, #1059)
## v0.16.0 - 2020-11-02
- Automatically create index on document push if index doesn't exist (#914)
- Sort displayedAttributes and facetDistribution (#946)
## v0.15.0 - 2020-09-30
- Update actix-web dependency to 3.0.0 (#963)
- Consider an empty query to be a placeholder search (#916)
## v0.14.1
- Fix version mismatch in snapshot importation (#959)
## v0.14.0
- Sort displayedAttributes (#943)
- Fix facet distribution case (#797)
- Snapshotting (#839)
- Fix bucket-sort unwrap bug (#915)
## v0.13.0
- placeholder search (#771)
- Add database version mismatch check (#794)
- Displayed and searchable attributes wildcard (#846)
- Remove sys-info route (#810)
- Check database version mismatch (#794)
- Fix unique docid bug (#841)
- Error codes in updates (#792)
- Sentry disable argument (#813)
- Log analytics if enabled (#825)
- Fix default values displayed on web interface (#874)
## v0.12.0
- Fix long documents not being indexed completely bug (#816)
- Fix distinct attribute returning id instead of name (#800)
- error code rename (#805)
## v0.11.1
- Fix facet cache on document update (#789)
- Improvements on settings consistency (#778)
## v0.11.0
- Change the HTTP framework, moving from tide to actix-web (#601)
- Bump sentry version to 0.18.1 (#690)
- Enable max payload size override (#684)
- Disable sentry in debug (#681)
- Better terminal greeting (#680)
- Fix highlight misalignment (#679)
- Add support for facet count (#676)
- Add support for faceted search (#631)
- Add support for configuring the lmdb map size (#646, #647)
- Add exposed port for Dockerfile (#654)
- Add sentry probe (#664)
- Fix url trailing slash and double slash issues (#659)
- Fix accept all Content-Type by default (#653)
- Return the error message from Serde when a deserialization error is encountered (#661)
- Fix NormalizePath middleware to make the dashboard accessible (#695)
- Update sentry features to remove openssl (#702)
- Add SSL support (#669)
- Rename fieldsFrequency into fieldsDistribution in stats (#719)
- Add support for error code reporting (#703)
- Allow the dashboard to query private servers (#732)
- Add telemetry (#720)
- Add post route for search (#735)
## v0.10.1
- Add support for floating points in filters (#640)
- Add '@' character as tokenizer separator (#607)
- Add support for filtering on arrays of strings (#611)
## v0.10.0
- Refined filtering (#592)
- Add the number of hits in search result (#541)
- Add support for aligned crop in search result (#543)
- Sanitize the content displayed in the web interface (#539)
- Add support of nested null, boolean and seq values (#571 and #568, #574)
- Fixed the core benchmark (#576)
- Publish an ARMv7 and ARMv8 binaries on releases (#540 and #581)
- Fixed a bug where the result of the update status after the first update was empty (#542)
- Fixed a bug where stop words were not handled correctly (#594)
- Fix CORS issues (#602)
- Support wildcard on attributes to retrieve, highlight, and crop (#549, #565, and #598)

2343
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,7 @@
[workspace] [workspace]
members = [ members = [
"meilisearch-core",
"meilisearch-http", "meilisearch-http",
"meilisearch-schema", "meilisearch-error",
"meilisearch-types",
] ]
[profile.release] [profile.release]

View File

@ -1,5 +1,5 @@
# Compile # Compile
FROM alpine:3.10 AS compiler FROM alpine:3.14 AS compiler
RUN apk update --quiet RUN apk update --quiet
RUN apk add curl RUN apk add curl
@ -9,14 +9,30 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
WORKDIR /meilisearch WORKDIR /meilisearch
COPY . . COPY Cargo.lock .
COPY Cargo.toml .
COPY meilisearch-error/Cargo.toml meilisearch-error/
COPY meilisearch-http/Cargo.toml meilisearch-http/
ENV RUSTFLAGS="-C target-feature=-crt-static" ENV RUSTFLAGS="-C target-feature=-crt-static"
# Create dummy main.rs files for each workspace member to be able to compile all the dependencies
RUN find . -type d -name "meilisearch-*" | xargs -I{} sh -c 'mkdir {}/src; echo "fn main() { }" > {}/src/main.rs;'
# Use `cargo build` instead of `cargo vendor` because we need to not only download but compile dependencies too
RUN $HOME/.cargo/bin/cargo build --release
# Cleanup dummy main.rs files
RUN find . -path "*/src/main.rs" -delete
ARG COMMIT_SHA
ARG COMMIT_DATE
ENV COMMIT_SHA=${COMMIT_SHA} COMMIT_DATE=${COMMIT_DATE}
COPY . .
RUN $HOME/.cargo/bin/cargo build --release RUN $HOME/.cargo/bin/cargo build --release
# Run # Run
FROM alpine:3.10 FROM alpine:3.14
RUN apk add -q --no-cache libgcc tini RUN apk add -q --no-cache libgcc tini

View File

@ -1,3 +1,9 @@
status = ["Test on macos-latest", "Test on ubuntu-18.04"] status = [
# 4 hours timeout 'Tests on ubuntu-18.04',
timeout-sec = 14400 'Tests on macos-latest',
'Cargo check on Windows',
'Run Clippy',
'Run Rustfmt'
]
# 3 hours timeout
timeout-sec = 10800

38
bump.sh
View File

@ -1,38 +0,0 @@
#!/usr/bin/bash
NEW_VERSION=$1
if [ -z "$NEW_VERSION" ]
then
echo "error: a version number must be provided"
exit 1
fi
# find current version
CURRENT_VERSION=$(cat **/*.toml | grep meilisearch | grep version | sed 's/.*\([0-9]\+\.[0-9]\+\.[0-9]\+\).*/\1/' | sed "1q;d")
# bump all version in .toml
echo "bumping from version $CURRENT_VERSION to version $NEW_VERSION"
while true
do
read -r -p "Continue (y/n)?" choice
case "$choice" in
y|Y ) break;;
n|N ) echo "aborting bump" && exit 0;;
* ) echo "invalid choice";;
esac
done
# update all crate version
sed -i "s/version = \"$CURRENT_VERSION\"/version = \"$NEW_VERSION\"/" **/*.toml
printf "running cargo check: "
CARGO_CHECK=$(cargo check 2>&1)
if [ $? != "0" ]
then
printf "\033[31;1m FAIL \033[0m\n"
printf "$CARGO_CHECK"
exit 1
fi
printf "\033[32;1m OK \033[0m\n"

View File

@ -1,53 +0,0 @@
[package]
name = "meilisearch-core"
version = "0.20.0"
license = "MIT"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
[dependencies]
arc-swap = "1.2.0"
bincode = "1.3.1"
byteorder = "1.3.4"
chrono = { version = "0.4.19", features = ["serde"] }
compact_arena = "0.4.1"
cow-utils = "0.1.2"
crossbeam-channel = "0.5.0"
deunicode = "1.1.1"
either = "1.6.1"
env_logger = "0.8.2"
fst = "0.4.5"
hashbrown = { version = "0.9.1", features = ["serde"] }
heed = "0.10.6"
indexmap = { version = "1.6.1", features = ["serde-1"] }
intervaltree = "0.2.6"
itertools = "0.10.0"
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
log = "0.4.11"
meilisearch-error = { path = "../meilisearch-error", version = "0.20.0" }
meilisearch-schema = { path = "../meilisearch-schema", version = "0.20.0" }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.3" }
meilisearch-types = { path = "../meilisearch-types", version = "0.20.0" }
once_cell = "1.5.2"
ordered-float = { version = "2.0.1", features = ["serde"] }
pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" }
pest_derive = "2.1.0"
regex = "1.4.2"
sdset = "0.4.0"
serde = { version = "1.0.118", features = ["derive"] }
serde_json = { version = "1.0.61", features = ["preserve_order"] }
slice-group-by = "0.2.6"
unicase = "2.6.0"
zerocopy = "0.3.0"
[dev-dependencies]
assert_matches = "1.4.0"
criterion = "0.3.3"
csv = "1.1.5"
rustyline = { version = "7.1.0", default-features = false }
structopt = "0.3.21"
tempfile = "3.1.0"
termcolor = "1.1.2"
[target.'cfg(unix)'.dev-dependencies]
jemallocator = "0.3.2"

View File

@ -1,473 +0,0 @@
use std::collections::HashSet;
use std::collections::btree_map::{BTreeMap, Entry};
use std::error::Error;
use std::io::{Read, Write};
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
use std::{fs, io, sync::mpsc};
use rustyline::{Config, Editor};
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilisearch_core::{Database, DatabaseOptions, Highlight, ProcessedUpdateResult};
use meilisearch_core::settings::Settings;
use meilisearch_schema::FieldId;
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)]
struct IndexCommand {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// The csv file path to index, you can also use `-` to specify the standard input.
#[structopt(parse(from_os_str))]
csv_data_path: PathBuf,
/// The path to the settings.
#[structopt(long, parse(from_os_str))]
settings: PathBuf,
#[structopt(long)]
update_group_size: Option<usize>,
#[structopt(long, parse(from_os_str))]
compact_to_path: Option<PathBuf>,
}
#[derive(Debug, StructOpt)]
struct SearchCommand {
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// Timeout after which the search will return results.
#[structopt(long)]
fetch_timeout_ms: Option<u64>,
/// The number of returned results
#[structopt(short, long, default_value = "10")]
number_results: usize,
/// The number of characters before and after the first match
#[structopt(short = "C", long, default_value = "35")]
char_context: usize,
/// A filter string that can be `!adult` or `adult` to
/// filter documents on this specfied field
#[structopt(short, long)]
filter: Option<String>,
/// Fields that must be displayed.
displayed_fields: Vec<String>,
}
#[derive(Debug, StructOpt)]
struct ShowUpdatesCommand {
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
}
#[derive(Debug, StructOpt)]
enum Command {
Index(IndexCommand),
Search(SearchCommand),
ShowUpdates(ShowUpdatesCommand),
}
impl Command {
fn path(&self) -> &Path {
match self {
Command::Index(command) => &command.database_path,
Command::Search(command) => &command.database_path,
Command::ShowUpdates(command) => &command.database_path,
}
}
}
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
struct Document(indexmap::IndexMap<String, String>);
fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dyn Error>> {
let start = Instant::now();
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn =
move |_name: &str, update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(&command.index_uid) {
Some(index) => index,
None => database.create_index(&command.index_uid).unwrap(),
};
database.set_update_callback(Box::new(update_fn));
let db = &database;
let settings = {
let string = fs::read_to_string(&command.settings)?;
let settings: Settings = serde_json::from_str(&string).unwrap();
settings.to_update().unwrap()
};
db.update_write(|w| index.settings_update(w, settings))?;
let mut rdr = if command.csv_data_path.as_os_str() == "-" {
csv::Reader::from_reader(Box::new(io::stdin()) as Box<dyn Read>)
} else {
let file = std::fs::File::open(command.csv_data_path)?;
csv::Reader::from_reader(Box::new(file) as Box<dyn Read>)
};
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
let mut max_update_id = 0;
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let mut additions = index.documents_addition();
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file {
break;
}
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
additions.update_document(document);
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = command.update_group_size {
if i % group_size == 0 {
break;
}
}
}
println!();
let update_id = db.update_write(|w| additions.finalize(w))?;
println!("committing update...");
max_update_id = max_update_id.max(update_id);
println!("committed update {}", update_id);
}
println!("Waiting for update {}", max_update_id);
for id in receiver {
if id == max_update_id {
break;
}
}
println!(
"database created in {:.2?} at: {:?}",
start.elapsed(),
command.database_path
);
if let Some(path) = command.compact_to_path {
fs::create_dir_all(&path)?;
let start = Instant::now();
let _file = database.copy_and_compact_to_path(path.join("data.mdb"))?;
println!(
"database compacted in {:.2?} at: {:?}",
start.elapsed(),
path
);
}
Ok(())
}
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range {
[start, end] => [*start, *end],
_ => unreachable!(),
};
if highlighted {
stdout.set_color(
ColorSpec::new()
.set_fg(Some(Color::Yellow))
.set_underline(true),
)?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for highlight in highlights {
let char_index = highlight.char_index as usize;
let char_length = highlight.char_length as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => {
entry.insert(byte_length);
}
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
}
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, matches) = crop_text(&text, matches, 35);
/// ```
fn crop_text(
text: &str,
highlights: impl IntoIterator<Item = Highlight>,
context: usize,
) -> (String, Vec<Highlight>) {
let mut highlights = highlights.into_iter().peekable();
let char_index = highlights
.peek()
.map(|m| m.char_index as usize)
.unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let highlights = highlights
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
.map(|highlight| Highlight {
char_index: highlight.char_index - start as u16,
..highlight
})
.collect();
(text, highlights)
}
fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
let db = &database;
let index = database
.open_index(&command.index_uid)
.expect("Could not find index");
let reader = db.main_read_txn().unwrap();
let schema = index.main.schema(&reader)?;
reader.abort().unwrap();
let schema = schema.ok_or(meilisearch_core::Error::SchemaMissing)?;
let fields = command
.displayed_fields
.iter()
.map(String::as_str)
.collect::<HashSet<_>>();
let config = Config::builder().auto_add_history(true).build();
let mut readline = Editor::<()>::with_config(config);
let _ = readline.load_history("query-history.txt");
for result in readline.iter("Searching for: ") {
match result {
Ok(query) => {
let start_total = Instant::now();
let reader = db.main_read_txn().unwrap();
let ref_index = &index;
let ref_reader = &reader;
let mut builder = index.query_builder();
if let Some(timeout) = command.fetch_timeout_ms {
builder.with_fetch_timeout(Duration::from_millis(timeout));
}
if let Some(ref filter) = command.filter {
let filter = filter.as_str();
let (positive, filter) = if let Some(stripped) = filter.strip_prefix('!') {
(false, stripped)
} else {
(true, filter)
};
let attr = schema
.id(filter)
.expect("Could not find filtered attribute");
builder.with_filter(move |document_id| {
let string: String = ref_index
.document_attribute(ref_reader, document_id, attr)
.unwrap()
.unwrap();
(string == "true") == positive
});
}
let result = builder.query(ref_reader, Some(&query), 0..command.number_results)?;
let mut retrieve_duration = Duration::default();
let number_of_documents = result.documents.len();
for mut doc in result.documents {
doc.highlights
.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let start_retrieve = Instant::now();
let result = index.document::<Document>(&reader, Some(&fields), doc.id);
retrieve_duration += start_retrieve.elapsed();
match result {
Ok(Some(document)) => {
println!("raw-id: {:?}", doc.id);
for (name, text) in document.0 {
print!("{}: ", name);
let attr = schema.id(&name).unwrap();
let highlights = doc
.highlights
.iter()
.filter(|m| FieldId::new(m.attribute) == attr)
.cloned();
let (text, highlights) =
crop_text(&text, highlights, command.char_context);
let areas = create_highlight_areas(&text, &highlights);
display_highlights(&text, &areas)?;
println!();
}
}
Ok(None) => eprintln!("missing document"),
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for highlight in doc.highlights {
let attr = FieldId::new(highlight.attribute);
let name = schema.name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!(
"whole documents fields retrieve took {:.2?}",
retrieve_duration
);
eprintln!(
"===== Found {} results in {:.2?} =====",
number_of_documents,
start_total.elapsed()
);
}
Err(err) => {
println!("Error: {:?}", err);
break;
}
}
}
readline.save_history("query-history.txt").unwrap();
Ok(())
}
fn show_updates_command(
command: ShowUpdatesCommand,
database: Database,
) -> Result<(), Box<dyn Error>> {
let db = &database;
let index = database
.open_index(&command.index_uid)
.expect("Could not find index");
let reader = db.update_read_txn().unwrap();
let updates = index.all_updates_status(&reader)?;
println!("{:#?}", updates);
reader.abort().unwrap();
Ok(())
}
fn main() -> Result<(), Box<dyn Error>> {
env_logger::init();
let opt = Command::from_args();
let database = Database::open_or_create(opt.path(), DatabaseOptions::default())?;
match opt {
Command::Index(command) => index_command(command, database),
Command::Search(command) => search_command(command, database),
Command::ShowUpdates(command) => show_updates_command(command, database),
}
}

View File

@ -1,53 +0,0 @@
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::OnceCell;
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
use PrefixSetting::{NoPrefix, Prefix};
match query.len() {
0..=4 => {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
match setting {
Prefix => builder.build_prefix_dfa(query),
NoPrefix => builder.build_dfa(query),
}
}
5..=8 => {
let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
match setting {
Prefix => builder.build_prefix_dfa(query),
NoPrefix => builder.build_dfa(query),
}
}
_ => {
let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
match setting {
Prefix => builder.build_prefix_dfa(query),
NoPrefix => builder.build_dfa(query),
}
}
}
}
pub fn build_prefix_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub fn build_exact_dfa(query: &str) -> DFA {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
builder.build_dfa(query)
}

View File

@ -1,4 +0,0 @@
mod dfa;
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};

View File

@ -1,679 +0,0 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::mem;
use std::ops::Deref;
use std::ops::Range;
use std::rc::Rc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;
use std::fmt;
use compact_arena::{SmallArena, Idx32, mk_arena};
use log::{debug, error};
use sdset::{Set, SetBuf, exponential_search, SetOperation, Counter, duo::OpBuilder};
use slice_group_by::{GroupBy, GroupByMut};
use meilisearch_types::DocIndex;
use crate::criterion::{Criteria, Context, ContextMut};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult, Index, RankedMap, MainReader, Error};
use crate::query_tree::{create_query_tree, traverse_query_tree};
use crate::query_tree::{Operation, QueryResult, QueryKind, QueryId, PostingsKey};
use crate::query_tree::Context as QTContext;
#[derive(Debug, Default)]
pub struct SortResult {
pub documents: Vec<Document>,
pub nb_hits: usize,
pub exhaustive_nb_hit: bool,
pub facets: Option<HashMap<String, HashMap<String, usize>>>,
pub exhaustive_facets_count: Option<bool>,
}
#[allow(clippy::too_many_arguments)]
pub fn bucket_sort<'c, FI>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
facets_docids: Option<SetBuf<DocumentId>>,
facet_count_docids: Option<HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>>,
filter: Option<FI>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
index: &Index,
) -> MResult<SortResult>
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if filter.is_some() {
let distinct = |_| None;
let distinct_size = 1;
return bucket_sort_with_distinct(
reader,
query,
range,
facets_docids,
facet_count_docids,
filter,
distinct,
distinct_size,
criteria,
searchable_attrs,
index,
);
}
let mut result = SortResult::default();
let words_set = index.main.words_fst(reader)?;
let stop_words = index.main.stop_words_fst(reader)?;
let context = QTContext {
words_set,
stop_words,
synonyms: index.synonyms,
postings_lists: index.postings_lists,
prefix_postings_lists: index.prefix_postings_lists_cache,
};
let (operation, mapping) = create_query_tree(reader, &context, query)?;
debug!("operation:\n{:?}", operation);
debug!("mapping:\n{:?}", mapping);
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
match operation {
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Query(query) => { map.insert(query.id, &query.kind); },
}
}
let mut queries_kinds = HashMap::new();
recurs_operation(&mut queries_kinds, &operation);
let QueryResult { mut docids, queries } = traverse_query_tree(reader, &context, &operation)?;
debug!("found {} documents", docids.len());
debug!("number of postings {:?}", queries.len());
if let Some(facets_docids) = facets_docids {
let intersection = sdset::duo::OpBuilder::new(docids.as_ref(), facets_docids.as_set())
.intersection()
.into_set_buf();
docids = Cow::Owned(intersection);
}
if let Some(f) = facet_count_docids {
// hardcoded value, until approximation optimization
result.exhaustive_facets_count = Some(true);
result.facets = Some(facet_count(f, &docids));
}
let before = Instant::now();
mk_arena!(arena);
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
debug!("matches cleaned in {:.02?}", before.elapsed());
let before_bucket_sort = Instant::now();
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
raw_documents.push(raw_document);
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
before_raw_documents_building.elapsed(),
);
let before_criterion_loop = Instant::now();
let proximity_count = AtomicUsize::new(0);
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for mut group in tmp_groups {
let before_criterion_preparation = Instant::now();
let ctx = ContextMut {
reader,
postings_lists: &mut arena,
query_mapping: &mapping,
documents_fields_counts_store: index.documents_fields_counts,
};
criterion.prepare(ctx, &mut group)?;
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_mapping: &mapping,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
debug!("{:?} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed());
debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));
let schema = index.main.schema(reader)?.ok_or(Error::SchemaMissing)?;
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref(), &schema));
let documents = iter.collect();
debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());
result.documents = documents;
result.nb_hits = docids.len();
Ok(result)
}
#[allow(clippy::too_many_arguments)]
pub fn bucket_sort_with_distinct<'c, FI, FD>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
facets_docids: Option<SetBuf<DocumentId>>,
facet_count_docids: Option<HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>>,
filter: Option<FI>,
distinct: FD,
distinct_size: usize,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
index: &Index,
) -> MResult<SortResult>
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let mut result = SortResult::default();
let mut filtered_count = 0;
let words_set = index.main.words_fst(reader)?;
let stop_words = index.main.stop_words_fst(reader)?;
let context = QTContext {
words_set,
stop_words,
synonyms: index.synonyms,
postings_lists: index.postings_lists,
prefix_postings_lists: index.prefix_postings_lists_cache,
};
let (operation, mapping) = create_query_tree(reader, &context, query)?;
debug!("operation:\n{:?}", operation);
debug!("mapping:\n{:?}", mapping);
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
match operation {
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Query(query) => { map.insert(query.id, &query.kind); },
}
}
let mut queries_kinds = HashMap::new();
recurs_operation(&mut queries_kinds, &operation);
let QueryResult { mut docids, queries } = traverse_query_tree(reader, &context, &operation)?;
debug!("found {} documents", docids.len());
debug!("number of postings {:?}", queries.len());
if let Some(facets_docids) = facets_docids {
let intersection = OpBuilder::new(docids.as_ref(), facets_docids.as_set())
.intersection()
.into_set_buf();
docids = Cow::Owned(intersection);
}
if let Some(f) = facet_count_docids {
// hardcoded value, until approximation optimization
result.exhaustive_facets_count = Some(true);
result.facets = Some(facet_count(f, &docids));
}
let before = Instant::now();
mk_arena!(arena);
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
debug!("matches cleaned in {:.02?}", before.elapsed());
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
raw_documents.push(raw_document);
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
before_raw_documents_building.elapsed(),
);
let mut groups = vec![raw_documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables informs on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(distinct_size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for mut group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
let ctx = ContextMut {
reader,
postings_lists: &mut arena,
query_mapping: &mapping,
documents_fields_counts_store: index.documents_fields_counts,
};
let before_criterion_preparation = Instant::now();
criterion.prepare(ctx, &mut group)?;
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_mapping: &mapping,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| {
let accepted = (filter)(document.id);
// we only want to count it out the first time we see it
if !accepted {
filtered_count += 1;
}
accepted
})
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let mut seen = true;
let key = entry.or_insert_with(|| {
seen = false;
(distinct)(document.id).map(Rc::new)
});
let distinct = match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
// we only want to count the document if it is the first time we see it and
// if it wasn't accepted by distinct
if !seen && !distinct {
filtered_count += 1;
}
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
// once we classified the documents related to the current
// automatons we save that as the next valid result
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
let schema = index.main.schema(reader)?.ok_or(Error::SchemaMissing)?;
let mut documents = Vec::with_capacity(range.len());
for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &filter {
Some(_) => filter_map.remove(&raw_document.id).unwrap_or_else(|| {
error!("error during filtering: expected value for document id {}", &raw_document.id.0);
Default::default()
}),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&raw_document.id).unwrap_or_else(|| {
error!("error during distinct: expected value for document id {}", &raw_document.id.0);
Default::default()
});
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
documents.push(Document::from_raw(raw_document, &queries_kinds, &arena, searchable_attrs.as_ref(), &schema));
if documents.len() == range.len() {
break;
}
}
}
}
result.documents = documents;
result.nb_hits = docids.len() - filtered_count;
Ok(result)
}
fn cleanup_bare_matches<'tag, 'txn>(
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
docids: &Set<DocumentId>,
queries: HashMap<PostingsKey, Cow<'txn, Set<DocIndex>>>,
) -> Vec<BareMatch<'tag>>
{
let docidslen = docids.len() as f32;
let mut bare_matches = Vec::new();
for (PostingsKey { query, input, distance, is_exact }, matches) in queries {
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
let pllen = postings_list_view.len() as f32;
if docidslen / pllen >= 0.8 {
let mut offset = 0;
for matches in postings_list_view.linear_group_by_key(|m| m.document_id) {
let document_id = matches[0].document_id;
if docids.contains(&document_id) {
let range = postings_list_view.range(offset, matches.len());
let posting_list_index = arena.add(range);
let bare_match = BareMatch {
document_id,
query_index: query.id,
distance,
is_exact,
postings_list: posting_list_index,
};
bare_matches.push(bare_match);
}
offset += matches.len();
}
} else {
let mut offset = 0;
for id in docids.as_slice() {
let di = DocIndex { document_id: *id, ..DocIndex::default() };
let pos = exponential_search(&postings_list_view[offset..], &di).unwrap_or_else(|x| x);
offset += pos;
let group = postings_list_view[offset..]
.linear_group_by_key(|m| m.document_id)
.next()
.filter(|matches| matches[0].document_id == *id);
if let Some(matches) = group {
let range = postings_list_view.range(offset, matches.len());
let posting_list_index = arena.add(range);
let bare_match = BareMatch {
document_id: *id,
query_index: query.id,
distance,
is_exact,
postings_list: posting_list_index,
};
bare_matches.push(bare_match);
}
}
}
}
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
bare_matches
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: usize,
pub distance: u8,
pub is_exact: bool,
pub postings_list: Idx32<'tag>,
}
impl fmt::Debug for BareMatch<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BareMatch")
.field("document_id", &self.document_id)
.field("query_index", &self.query_index)
.field("distance", &self.distance)
.field("is_exact", &self.is_exact)
.finish()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
pub query_index: usize,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Clone)]
pub enum PostingsListView<'txn> {
Original {
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
},
Rewritten {
input: Rc<[u8]>,
postings_list: SetBuf<DocIndex>,
},
}
impl fmt::Debug for PostingsListView<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PostingsListView")
.field("input", &std::str::from_utf8(&self.input()).unwrap())
.field("postings_list", &self.as_ref())
.finish()
}
}
impl<'txn> PostingsListView<'txn> {
pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView::Original { input, postings_list, offset: 0, len }
}
pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
PostingsListView::Rewritten { input, postings_list }
}
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
let input = match self {
PostingsListView::Original { input, .. } => input.clone(),
PostingsListView::Rewritten { input, .. } => input.clone(),
};
*self = PostingsListView::rewritten(input, postings_list);
}
pub fn len(&self) -> usize {
match self {
PostingsListView::Original { len, .. } => *len,
PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
}
}
pub fn input(&self) -> &[u8] {
match self {
PostingsListView::Original { ref input, .. } => input,
PostingsListView::Rewritten { ref input, .. } => input,
}
}
pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
match self {
PostingsListView::Original { input, postings_list, offset, len } => {
assert!(range_offset + range_len <= *len);
PostingsListView::Original {
input: input.clone(),
postings_list: postings_list.clone(),
offset: offset + range_offset,
len: range_len,
}
},
PostingsListView::Rewritten { .. } => {
panic!("Cannot create a range on a rewritten postings list view");
}
}
}
}
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
self
}
}
impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
match *self {
PostingsListView::Original { ref postings_list, offset, len, .. } => {
Set::new_unchecked(&postings_list[offset..offset + len])
},
PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
}
}
}
/// sorts documents ids according to user defined ranking rules.
pub fn placeholder_document_sort(
document_ids: &mut [DocumentId],
index: &store::Index,
reader: &MainReader,
ranked_map: &RankedMap
) -> MResult<()> {
use crate::settings::RankingRule;
use std::cmp::Ordering;
enum SortOrder {
Asc,
Desc,
}
if let Some(ranking_rules) = index.main.ranking_rules(reader)? {
let schema = index.main.schema(reader)?
.ok_or(Error::SchemaMissing)?;
// Select custom rules from ranking rules, and map them to custom rules
// containing a field_id
let ranking_rules = ranking_rules.iter().filter_map(|r|
match r {
RankingRule::Asc(name) => schema.id(name).map(|f| (f, SortOrder::Asc)),
RankingRule::Desc(name) => schema.id(name).map(|f| (f, SortOrder::Desc)),
_ => None,
}).collect::<Vec<_>>();
document_ids.sort_unstable_by(|a, b| {
for (field_id, order) in &ranking_rules {
let a_value = ranked_map.get(*a, *field_id);
let b_value = ranked_map.get(*b, *field_id);
let (a, b) = match order {
SortOrder::Asc => (a_value, b_value),
SortOrder::Desc => (b_value, a_value),
};
match a.cmp(&b) {
Ordering::Equal => continue,
ordering => return ordering,
}
}
Ordering::Equal
});
}
Ok(())
}
/// For each entry in facet_docids, calculates the number of documents in the intersection with candidate_docids.
pub fn facet_count(
facet_docids: HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>,
candidate_docids: &Set<DocumentId>,
) -> HashMap<String, HashMap<String, usize>> {
let mut facets_counts = HashMap::with_capacity(facet_docids.len());
for (key, doc_map) in facet_docids {
let mut count_map = HashMap::with_capacity(doc_map.len());
for (_, (value, docids)) in doc_map {
let mut counter = Counter::new();
let op = OpBuilder::new(docids.as_ref(), candidate_docids).intersection();
SetOperation::<DocumentId>::extend_collection(op, &mut counter);
count_map.insert(value.to_string(), counter.0);
}
facets_counts.insert(key, count_map);
}
facets_counts
}

View File

@ -1,37 +0,0 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::{RawDocument, MResult};
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
pub struct Attribute;
impl Criterion for Attribute {
fn name(&self) -> &str { "attribute" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_of_attribute(matches: &[SimpleMatch]) -> usize {
let mut sum_of_attribute = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_of_attribute += group[0].attribute as usize;
}
sum_of_attribute
}
let lhs = sum_of_attribute(&lhs.processed_matches);
let rhs = sum_of_attribute(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,16 +0,0 @@
use std::cmp::Ordering;
use crate::RawDocument;
use super::{Criterion, Context};
pub struct DocumentId;
impl Criterion for DocumentId {
fn name(&self) -> &str { "stable document id" }
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = &lhs.id;
let rhs = &rhs.id;
lhs.cmp(rhs)
}
}

View File

@ -1,78 +0,0 @@
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::{HashMap, Entry};
use meilisearch_schema::IndexedPos;
use slice_group_by::GroupBy;
use crate::{RawDocument, MResult};
use crate::bucket_sort::BareMatch;
use super::{Criterion, Context, ContextMut};
pub struct Exactness;
impl Criterion for Exactness {
fn name(&self) -> &str { "exactness" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
let store = ctx.documents_fields_counts_store;
let reader = ctx.reader;
'documents: for doc in documents {
doc.bare_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
// mark the document if we find a "one word field" that matches
let mut fields_counts = HashMap::new();
for group in doc.bare_matches.linear_group_by_key(|bm| bm.query_index) {
for group in group.linear_group_by_key(|bm| bm.is_exact) {
if !group[0].is_exact { break }
for bm in group {
for di in ctx.postings_lists[bm.postings_list].as_ref() {
let attr = IndexedPos(di.attribute);
let count = match fields_counts.entry(attr) {
Entry::Occupied(entry) => *entry.get(),
Entry::Vacant(entry) => {
let count = store.document_field_count(reader, doc.id, attr)?;
*entry.insert(count)
},
};
if count == Some(1) {
doc.contains_one_word_field = true;
continue 'documents
}
}
}
}
}
}
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
let mut sum_exact_query_words = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_exact_query_words += group[0].is_exact as usize;
}
sum_exact_query_words
}
// does it contains a "one word field"
lhs.contains_one_word_field.cmp(&rhs.contains_one_word_field).reverse()
// if not, with document contains the more exact words
.then_with(|| {
let lhs = sum_exact_query_words(&lhs.bare_matches);
let rhs = sum_exact_query_words(&rhs.bare_matches);
lhs.cmp(&rhs).reverse()
})
}
}

View File

@ -1,292 +0,0 @@
use std::cmp::{self, Ordering};
use std::collections::HashMap;
use std::ops::Range;
use compact_arena::SmallArena;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::bucket_sort::{SimpleMatch, PostingsListView};
use crate::database::MainT;
use crate::query_tree::QueryId;
use crate::{store, RawDocument, MResult};
mod typo;
mod words;
mod proximity;
mod attribute;
mod words_position;
mod exactness;
mod document_id;
mod sort_by_attr;
pub use self::typo::Typo;
pub use self::words::Words;
pub use self::proximity::Proximity;
pub use self::attribute::Attribute;
pub use self::words_position::WordsPosition;
pub use self::exactness::Exactness;
pub use self::document_id::DocumentId;
pub use self::sort_by_attr::SortByAttr;
pub trait Criterion {
fn name(&self) -> &str;
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
_documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
Ok(())
}
fn evaluate<'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> Ordering;
#[inline]
fn eq<'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> bool
{
self.evaluate(ctx, lhs, rhs) == Ordering::Equal
}
}
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> {
pub reader: &'h heed::RoTxn<'h, MainT>,
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
}
pub struct Context<'p, 'tag, 'txn, 'q> {
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
}
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> CriteriaBuilder<'a> {
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder {
inner: Vec::with_capacity(capacity),
}
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
#[allow(clippy::should_implement_trait)]
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where
C: Criterion,
{
self.push(criterion);
self
}
pub fn push<C: 'a>(&mut self, criterion: C)
where
C: Criterion,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<'a> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(Typo)
.add(Words)
.add(Proximity)
.add(Attribute)
.add(WordsPosition)
.add(Exactness)
.add(DocumentId)
.build()
}
}
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}
}
fn prepare_query_distances<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
query_mapping: &HashMap<QueryId, Range<usize>>,
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) {
for document in documents {
if !document.processed_distances.is_empty() { continue }
let mut processed = Vec::new();
for m in document.bare_matches.iter() {
if postings_lists[m.postings_list].is_empty() { continue }
let range = query_mapping[&(m.query_index as usize)].clone();
let new_len = cmp::max(range.end as usize, processed.len());
processed.resize(new_len, None);
for index in range {
let index = index as usize;
processed[index] = match processed[index] {
Some(distance) if distance > m.distance => Some(m.distance),
Some(distance) => Some(distance),
None => Some(m.distance),
};
}
}
document.processed_distances = processed;
}
}
fn prepare_bare_matches<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
query_mapping: &HashMap<QueryId, Range<usize>>,
) {
for document in documents {
if !document.processed_matches.is_empty() { continue }
let mut processed = Vec::new();
for m in document.bare_matches.iter() {
let postings_list = &postings_lists[m.postings_list];
processed.reserve(postings_list.len());
for di in postings_list.as_ref() {
let simple_match = SimpleMatch {
query_index: m.query_index,
distance: m.distance,
attribute: di.attribute,
word_index: di.word_index,
is_exact: m.is_exact,
};
processed.push(simple_match);
}
}
let processed = multiword_rewrite_matches(&mut processed, query_mapping);
document.processed_matches = processed.into_vec();
}
}
fn multiword_rewrite_matches(
matches: &mut [SimpleMatch],
query_mapping: &HashMap<QueryId, Range<usize>>,
) -> SetBuf<SimpleMatch>
{
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for match_ in same_word_index {
let mut replacement = query_mapping[&(match_.query_index as usize)].clone();
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
for nmatch_ in next_group {
let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone();
let query_index = rep.next().unwrap();
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
biggest = biggest.max(i + 1);
}
}
padded_matches.push(padmatch);
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups so stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
// debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::from_dirty(padded_matches)
}

View File

@ -1,68 +0,0 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::bucket_sort::{SimpleMatch};
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
const MAX_DISTANCE: u16 = 8;
pub struct Proximity;
impl Criterion for Proximity {
fn name(&self) -> &str { "proximity" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
else { index_proximity(lhs.word_index, rhs.word_index) }
}
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
let mut min_prox = u16::max_value();
for a in lhs {
for b in rhs {
let prox = attribute_proximity(*a, *b);
min_prox = cmp::min(min_prox, prox);
}
}
min_prox
}
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
let mut proximity = 0;
let mut iter = matches.linear_group_by_key(|m| m.query_index);
// iterate over groups by windows of size 2
let mut last = iter.next();
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
proximity += min_proximity(lhs, rhs);
last = Some(rhs);
}
proximity
}
let lhs = matches_proximity(&lhs.processed_matches);
let rhs = matches_proximity(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,129 +0,0 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use meilisearch_schema::{Schema, FieldId};
use crate::{RankedMap, RawDocument};
use super::{Criterion, Context};
/// An helper struct that permit to sort documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you must check the [`Ord`] of `Option` implementation.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilisearch::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(Typo)
/// .add(Words)
/// .add(Proximity)
/// .add(Attribute)
/// .add(WordsPosition)
/// .add(Exactness)
/// .add(custom_ranking)
/// .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortByAttr<'a> {
ranked_map: &'a RankedMap,
field_id: FieldId,
reversed: bool,
}
impl<'a> SortByAttr<'a> {
pub fn lower_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError> {
SortByAttr::new(ranked_map, schema, attr_name, false)
}
pub fn higher_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError> {
SortByAttr::new(ranked_map, schema, attr_name, true)
}
fn new(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
reversed: bool,
) -> Result<SortByAttr<'a>, SortByAttrError> {
let field_id = match schema.id(attr_name) {
Some(field_id) => field_id,
None => return Err(SortByAttrError::AttributeNotFound),
};
if !schema.is_ranked(field_id) {
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
}
Ok(SortByAttr {
ranked_map,
field_id,
reversed,
})
}
}
impl Criterion for SortByAttr<'_> {
fn name(&self) -> &str {
"sort by attribute"
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.field_id);
let rhs = self.ranked_map.get(rhs.id, self.field_id);
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
let order = lhs.cmp(&rhs);
if self.reversed {
order.reverse()
} else {
order
}
}
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
AttributeNotFound,
AttributeNotRegisteredForRanking,
}
impl fmt::Display for SortByAttrError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use SortByAttrError::*;
match self {
AttributeNotFound => f.write_str("attribute not found in the schema"),
AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
}
}
}
impl Error for SortByAttrError {}

View File

@ -1,56 +0,0 @@
use std::cmp::Ordering;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Typo;
impl Criterion for Typo {
fn name(&self) -> &str { "typo" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
#[allow(clippy::approx_constant)]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn compute_typos(distances: &[Option<u8>]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
for distance in distances {
if let Some(distance) = distance {
sum_typos += custom_log10(*distance);
number_words += 1;
}
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
let lhs = compute_typos(&lhs.processed_distances);
let rhs = compute_typos(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -1,31 +0,0 @@
use std::cmp::Ordering;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Words;
impl Criterion for Words {
fn name(&self) -> &str { "words" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
distances.iter().cloned().filter(Option::is_some).count()
}
let lhs = number_of_query_words(&lhs.processed_distances);
let rhs = number_of_query_words(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -1,37 +0,0 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::bucket_sort::SimpleMatch;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
pub struct WordsPosition;
impl Criterion for WordsPosition {
fn name(&self) -> &str { "words position" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
let mut sum_words_position = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_words_position += group[0].word_index as usize;
}
sum_words_position
}
let lhs = sum_words_position(&lhs.processed_matches);
let rhs = sum_words_position(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,103 +0,0 @@
use hashbrown::HashMap;
use std::hash::Hash;
pub struct DistinctMap<K> {
inner: HashMap<K, usize>,
limit: usize,
len: usize,
}
impl<K: Hash + Eq> DistinctMap<K> {
pub fn new(limit: usize) -> Self {
DistinctMap {
inner: HashMap::new(),
limit,
len: 0,
}
}
pub fn len(&self) -> usize {
self.len
}
}
pub struct BufferedDistinctMap<'a, K> {
internal: &'a mut DistinctMap<K>,
inner: HashMap<K, usize>,
len: usize,
}
impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
BufferedDistinctMap {
internal,
inner: HashMap::new(),
len: 0,
}
}
pub fn register(&mut self, key: K) -> bool {
let internal_seen = self.internal.inner.get(&key).unwrap_or(&0);
let inner_seen = self.inner.entry(key).or_insert(0);
let seen = *internal_seen + *inner_seen;
if seen < self.internal.limit {
*inner_seen += 1;
self.len += 1;
true
} else {
false
}
}
pub fn register_without_key(&mut self) -> bool {
self.len += 1;
true
}
pub fn transfert_to_internal(&mut self) {
for (k, v) in self.inner.drain() {
let value = self.internal.inner.entry(k).or_insert(0);
*value += v;
}
self.internal.len += self.len;
self.len = 0;
}
pub fn len(&self) -> usize {
self.internal.len() + self.len
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy_distinct_map() {
let mut map = DistinctMap::new(2);
let mut buffered = BufferedDistinctMap::new(&mut map);
for x in &[1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] {
buffered.register(x);
}
buffered.transfert_to_internal();
assert_eq!(map.len(), 8);
let mut map = DistinctMap::new(2);
let mut buffered = BufferedDistinctMap::new(&mut map);
assert_eq!(buffered.register(1), true);
assert_eq!(buffered.register(1), true);
assert_eq!(buffered.register(1), false);
assert_eq!(buffered.register(1), false);
assert_eq!(buffered.register(2), true);
assert_eq!(buffered.register(3), true);
assert_eq!(buffered.register(2), true);
assert_eq!(buffered.register(2), false);
buffered.transfert_to_internal();
assert_eq!(map.len(), 5);
}
}

View File

@ -1,224 +0,0 @@
use crate::serde::{DeserializerError, SerializerError};
use serde_json::Error as SerdeJsonError;
use pest::error::Error as PestError;
use crate::filters::Rule;
use std::{error, fmt, io};
pub use bincode::Error as BincodeError;
pub use fst::Error as FstError;
pub use heed::Error as HeedError;
pub use pest::error as pest_error;
use meilisearch_error::{ErrorCode, Code};
pub type MResult<T> = Result<T, Error>;
#[derive(Debug)]
pub enum Error {
Bincode(bincode::Error),
Deserializer(DeserializerError),
FacetError(FacetError),
FilterParseError(PestError<Rule>),
Fst(fst::Error),
Heed(heed::Error),
IndexAlreadyExists,
Io(io::Error),
MaxFieldsLimitExceeded,
MissingDocumentId,
MissingPrimaryKey,
Schema(meilisearch_schema::Error),
SchemaMissing,
SerdeJson(SerdeJsonError),
Serializer(SerializerError),
VersionMismatch(String),
WordIndexMissing,
}
impl ErrorCode for Error {
fn error_code(&self) -> Code {
use Error::*;
match self {
FacetError(_) => Code::Facet,
FilterParseError(_) => Code::Filter,
IndexAlreadyExists => Code::IndexAlreadyExists,
MissingPrimaryKey => Code::MissingPrimaryKey,
MissingDocumentId => Code::MissingDocumentId,
MaxFieldsLimitExceeded => Code::MaxFieldsLimitExceeded,
Schema(s) => s.error_code(),
WordIndexMissing
| SchemaMissing => Code::InvalidState,
Heed(_)
| Fst(_)
| SerdeJson(_)
| Bincode(_)
| Serializer(_)
| Deserializer(_)
| VersionMismatch(_)
| Io(_) => Code::Internal,
}
}
}
impl From<io::Error> for Error {
fn from(error: io::Error) -> Error {
Error::Io(error)
}
}
impl From<PestError<Rule>> for Error {
fn from(error: PestError<Rule>) -> Error {
Error::FilterParseError(error.renamed_rules(|r| {
let s = match r {
Rule::or => "OR",
Rule::and => "AND",
Rule::not => "NOT",
Rule::string => "string",
Rule::word => "word",
Rule::greater => "field > value",
Rule::less => "field < value",
Rule::eq => "field = value",
Rule::leq => "field <= value",
Rule::geq => "field >= value",
Rule::key => "key",
_ => "other",
};
s.to_string()
}))
}
}
impl From<FacetError> for Error {
fn from(error: FacetError) -> Error {
Error::FacetError(error)
}
}
impl From<meilisearch_schema::Error> for Error {
fn from(error: meilisearch_schema::Error) -> Error {
Error::Schema(error)
}
}
impl From<HeedError> for Error {
fn from(error: HeedError) -> Error {
Error::Heed(error)
}
}
impl From<FstError> for Error {
fn from(error: FstError) -> Error {
Error::Fst(error)
}
}
impl From<SerdeJsonError> for Error {
fn from(error: SerdeJsonError) -> Error {
Error::SerdeJson(error)
}
}
impl From<BincodeError> for Error {
fn from(error: BincodeError) -> Error {
Error::Bincode(error)
}
}
impl From<SerializerError> for Error {
fn from(error: SerializerError) -> Error {
match error {
SerializerError::DocumentIdNotFound => Error::MissingDocumentId,
e => Error::Serializer(e),
}
}
}
impl From<DeserializerError> for Error {
fn from(error: DeserializerError) -> Error {
Error::Deserializer(error)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match self {
Bincode(e) => write!(f, "bincode error; {}", e),
Deserializer(e) => write!(f, "deserializer error; {}", e),
FacetError(e) => write!(f, "error processing facet filter: {}", e),
FilterParseError(e) => write!(f, "error parsing filter; {}", e),
Fst(e) => write!(f, "fst error; {}", e),
Heed(e) => write!(f, "heed error; {}", e),
IndexAlreadyExists => write!(f, "index already exists"),
Io(e) => write!(f, "{}", e),
MaxFieldsLimitExceeded => write!(f, "maximum number of fields in a document exceeded"),
MissingDocumentId => write!(f, "document id is missing"),
MissingPrimaryKey => write!(f, "schema cannot be built without a primary key"),
Schema(e) => write!(f, "schema error; {}", e),
SchemaMissing => write!(f, "this index does not have a schema"),
SerdeJson(e) => write!(f, "serde json error; {}", e),
Serializer(e) => write!(f, "serializer error; {}", e),
VersionMismatch(version) => write!(f, "Cannot open database, expected MeiliSearch engine version: {}, current engine version: {}.{}.{}",
version,
env!("CARGO_PKG_VERSION_MAJOR"),
env!("CARGO_PKG_VERSION_MINOR"),
env!("CARGO_PKG_VERSION_PATCH")),
WordIndexMissing => write!(f, "this index does not have a word index"),
}
}
}
impl error::Error for Error {}
struct FilterParseError(PestError<Rule>);
impl fmt::Display for FilterParseError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use crate::pest_error::LineColLocation::*;
let (line, column) = match self.0.line_col {
Span((line, _), (column, _)) => (line, column),
Pos((line, column)) => (line, column),
};
write!(f, "parsing error on line {} at column {}: {}", line, column, self.0.variant.message())
}
}
#[derive(Debug)]
pub enum FacetError {
EmptyArray,
ParsingError(String),
UnexpectedToken { expected: &'static [&'static str], found: String },
InvalidFormat(String),
AttributeNotFound(String),
AttributeNotSet { expected: Vec<String>, found: String },
InvalidDocumentAttribute(String),
NoAttributesForFaceting,
}
impl FacetError {
pub fn unexpected_token(expected: &'static [&'static str], found: impl ToString) -> FacetError {
FacetError::UnexpectedToken{ expected, found: found.to_string() }
}
pub fn attribute_not_set(expected: Vec<String>, found: impl ToString) -> FacetError {
FacetError::AttributeNotSet{ expected, found: found.to_string() }
}
}
impl fmt::Display for FacetError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use FacetError::*;
match self {
EmptyArray => write!(f, "empty array in facet filter is unspecified behavior"),
ParsingError(msg) => write!(f, "parsing error: {}", msg),
UnexpectedToken { expected, found } => write!(f, "unexpected token {}, expected {}", found, expected.join("or")),
InvalidFormat(found) => write!(f, "invalid facet: {}, facets should be \"facetName:facetValue\"", found),
AttributeNotFound(attr) => write!(f, "unknown {:?} attribute", attr),
AttributeNotSet { found, expected } => write!(f, "`{}` is not set as a faceted attribute. available facet attributes: {}", found, expected.join(", ")),
InvalidDocumentAttribute(attr) => write!(f, "invalid document attribute {}, accepted types: String and [String]", attr),
NoAttributesForFaceting => write!(f, "impossible to perform faceted search, no attributes for faceting are set"),
}
}
}

View File

@ -1,357 +0,0 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::hash::Hash;
use std::ops::Deref;
use cow_utils::CowUtils;
use either::Either;
use heed::types::{Str, OwnedType};
use indexmap::IndexMap;
use serde_json::Value;
use meilisearch_schema::{FieldId, Schema};
use meilisearch_types::DocumentId;
use crate::database::MainT;
use crate::error::{FacetError, MResult};
use crate::store::BEU16;
/// Data structure used to represent a boolean expression in the form of nested arrays.
/// Values in the outer array are and-ed together, values in the inner arrays are or-ed together.
#[derive(Debug, PartialEq)]
pub struct FacetFilter(Vec<Either<Vec<FacetKey>, FacetKey>>);
impl Deref for FacetFilter {
type Target = Vec<Either<Vec<FacetKey>, FacetKey>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl FacetFilter {
pub fn from_str(
s: &str,
schema: &Schema,
attributes_for_faceting: &[FieldId],
) -> MResult<FacetFilter> {
if attributes_for_faceting.is_empty() {
return Err(FacetError::NoAttributesForFaceting.into());
}
let parsed = serde_json::from_str::<Value>(s).map_err(|e| FacetError::ParsingError(e.to_string()))?;
let mut filter = Vec::new();
match parsed {
Value::Array(and_exprs) => {
if and_exprs.is_empty() {
return Err(FacetError::EmptyArray.into());
}
for expr in and_exprs {
match expr {
Value::String(s) => {
let key = FacetKey::from_str( &s, schema, attributes_for_faceting)?;
filter.push(Either::Right(key));
}
Value::Array(or_exprs) => {
if or_exprs.is_empty() {
return Err(FacetError::EmptyArray.into());
}
let mut inner = Vec::new();
for expr in or_exprs {
match expr {
Value::String(s) => {
let key = FacetKey::from_str( &s, schema, attributes_for_faceting)?;
inner.push(key);
}
bad_value => return Err(FacetError::unexpected_token(&["String"], bad_value).into()),
}
}
filter.push(Either::Left(inner));
}
bad_value => return Err(FacetError::unexpected_token(&["Array", "String"], bad_value).into()),
}
}
Ok(Self(filter))
}
bad_value => Err(FacetError::unexpected_token(&["Array"], bad_value).into()),
}
}
}
#[derive(Debug, Eq, PartialEq, Hash)]
#[repr(C)]
pub struct FacetKey(FieldId, String);
impl FacetKey {
pub fn new(field_id: FieldId, value: String) -> Self {
let value = match value.cow_to_lowercase() {
Cow::Borrowed(_) => value,
Cow::Owned(s) => s,
};
Self(field_id, value)
}
pub fn key(&self) -> FieldId {
self.0
}
pub fn value(&self) -> &str {
&self.1
}
// TODO improve parser
fn from_str(
s: &str,
schema: &Schema,
attributes_for_faceting: &[FieldId],
) -> Result<Self, FacetError> {
let mut split = s.splitn(2, ':');
let key = split
.next()
.ok_or_else(|| FacetError::InvalidFormat(s.to_string()))?
.trim();
let field_id = schema
.id(key)
.ok_or_else(|| FacetError::AttributeNotFound(key.to_string()))?;
if !attributes_for_faceting.contains(&field_id) {
return Err(FacetError::attribute_not_set(
attributes_for_faceting
.iter()
.filter_map(|&id| schema.name(id))
.map(str::to_string)
.collect::<Vec<_>>(),
key))
}
let value = split
.next()
.ok_or_else(|| FacetError::InvalidFormat(s.to_string()))?
.trim();
// unquoting the string if need be:
let mut indices = value.char_indices();
let value = match (indices.next(), indices.last()) {
(Some((s, '\'')), Some((e, '\''))) |
(Some((s, '\"')), Some((e, '\"'))) => value[s + 1..e].to_string(),
_ => value.to_string(),
};
Ok(Self::new(field_id, value))
}
}
impl<'a> heed::BytesEncode<'a> for FacetKey {
type EItem = FacetKey;
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let mut buffer = Vec::with_capacity(2 + item.1.len());
let id = BEU16::new(item.key().into());
let id_bytes = OwnedType::bytes_encode(&id)?;
let value_bytes = Str::bytes_encode(item.value())?;
buffer.extend_from_slice(id_bytes.as_ref());
buffer.extend_from_slice(value_bytes.as_ref());
Some(Cow::Owned(buffer))
}
}
impl<'a> heed::BytesDecode<'a> for FacetKey {
type DItem = FacetKey;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (id_bytes, value_bytes) = bytes.split_at(2);
let id = OwnedType::<BEU16>::bytes_decode(id_bytes)?;
let id = id.get().into();
let string = Str::bytes_decode(&value_bytes)?;
Some(FacetKey(id, string.to_string()))
}
}
pub fn add_to_facet_map(
facet_map: &mut HashMap<FacetKey, (String, Vec<DocumentId>)>,
field_id: FieldId,
value: Value,
document_id: DocumentId,
) -> Result<(), FacetError> {
let value = match value {
Value::String(s) => s,
// ignore null
Value::Null => return Ok(()),
value => return Err(FacetError::InvalidDocumentAttribute(value.to_string())),
};
let key = FacetKey::new(field_id, value.clone());
facet_map.entry(key).or_insert_with(|| (value, Vec::new())).1.push(document_id);
Ok(())
}
pub fn facet_map_from_docids(
rtxn: &heed::RoTxn<MainT>,
index: &crate::Index,
document_ids: &[DocumentId],
attributes_for_facetting: &[FieldId],
) -> MResult<HashMap<FacetKey, (String, Vec<DocumentId>)>> {
// A hashmap that ascociate a facet key to a pair containing the original facet attribute
// string with it's case preserved, and a list of document ids for that facet attribute.
let mut facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)> = HashMap::new();
for document_id in document_ids {
for result in index
.documents_fields
.document_fields(rtxn, *document_id)?
{
let (field_id, bytes) = result?;
if attributes_for_facetting.contains(&field_id) {
match serde_json::from_slice(bytes)? {
Value::Array(values) => {
for v in values {
add_to_facet_map(&mut facet_map, field_id, v, *document_id)?;
}
}
v => add_to_facet_map(&mut facet_map, field_id, v, *document_id)?,
};
}
}
}
Ok(facet_map)
}
pub fn facet_map_from_docs(
schema: &Schema,
documents: &HashMap<DocumentId, IndexMap<String, Value>>,
attributes_for_facetting: &[FieldId],
) -> MResult<HashMap<FacetKey, (String, Vec<DocumentId>)>> {
let mut facet_map = HashMap::new();
let attributes_for_facetting = attributes_for_facetting
.iter()
.filter_map(|&id| schema.name(id).map(|name| (id, name)))
.collect::<Vec<_>>();
for (id, document) in documents {
for (field_id, name) in &attributes_for_facetting {
if let Some(value) = document.get(*name) {
match value {
Value::Array(values) => {
for v in values {
add_to_facet_map(&mut facet_map, *field_id, v.clone(), *id)?;
}
}
v => add_to_facet_map(&mut facet_map, *field_id, v.clone(), *id)?,
}
}
}
}
Ok(facet_map)
}
#[cfg(test)]
mod test {
use super::*;
use meilisearch_schema::Schema;
#[test]
fn test_facet_key() {
let mut schema = Schema::default();
let id = schema.insert_with_position("hello").unwrap().0;
let facet_list = [schema.id("hello").unwrap()];
assert_eq!(
FacetKey::from_str("hello:12", &schema, &facet_list).unwrap(),
FacetKey::new(id, "12".to_string())
);
assert_eq!(
FacetKey::from_str("hello:\"foo bar\"", &schema, &facet_list).unwrap(),
FacetKey::new(id, "foo bar".to_string())
);
assert_eq!(
FacetKey::from_str("hello:'foo bar'", &schema, &facet_list).unwrap(),
FacetKey::new(id, "foo bar".to_string())
);
// weird case
assert_eq!(
FacetKey::from_str("hello:blabla:machin", &schema, &facet_list).unwrap(),
FacetKey::new(id, "blabla:machin".to_string())
);
assert_eq!(
FacetKey::from_str("hello:\"\"", &schema, &facet_list).unwrap(),
FacetKey::new(id, "".to_string())
);
assert_eq!(
FacetKey::from_str("hello:'", &schema, &facet_list).unwrap(),
FacetKey::new(id, "'".to_string())
);
assert_eq!(
FacetKey::from_str("hello:''", &schema, &facet_list).unwrap(),
FacetKey::new(id, "".to_string())
);
assert!(FacetKey::from_str("hello", &schema, &facet_list).is_err());
assert!(FacetKey::from_str("toto:12", &schema, &facet_list).is_err());
}
#[test]
fn test_parse_facet_array() {
use either::Either::{Left, Right};
let mut schema = Schema::default();
let _id = schema.insert_with_position("hello").unwrap();
let facet_list = [schema.id("hello").unwrap()];
assert_eq!(
FacetFilter::from_str("[[\"hello:12\"]]", &schema, &facet_list).unwrap(),
FacetFilter(vec![Left(vec![FacetKey(FieldId(0), "12".to_string())])])
);
assert_eq!(
FacetFilter::from_str("[\"hello:12\"]", &schema, &facet_list).unwrap(),
FacetFilter(vec![Right(FacetKey(FieldId(0), "12".to_string()))])
);
assert_eq!(
FacetFilter::from_str("[\"hello:12\", \"hello:13\"]", &schema, &facet_list).unwrap(),
FacetFilter(vec![
Right(FacetKey(FieldId(0), "12".to_string())),
Right(FacetKey(FieldId(0), "13".to_string()))
])
);
assert_eq!(
FacetFilter::from_str("[[\"hello:12\", \"hello:13\"]]", &schema, &facet_list).unwrap(),
FacetFilter(vec![Left(vec![
FacetKey(FieldId(0), "12".to_string()),
FacetKey(FieldId(0), "13".to_string())
])])
);
assert_eq!(
FacetFilter::from_str(
"[[\"hello:12\", \"hello:13\"], \"hello:14\"]",
&schema,
&facet_list
)
.unwrap(),
FacetFilter(vec![
Left(vec![
FacetKey(FieldId(0), "12".to_string()),
FacetKey(FieldId(0), "13".to_string())
]),
Right(FacetKey(FieldId(0), "14".to_string()))
])
);
// invalid array depths
assert!(FacetFilter::from_str(
"[[[\"hello:12\", \"hello:13\"], \"hello:14\"]]",
&schema,
&facet_list
)
.is_err());
assert!(FacetFilter::from_str(
"[[[\"hello:12\", \"hello:13\"]], \"hello:14\"]]",
&schema,
&facet_list
)
.is_err());
assert!(FacetFilter::from_str("\"hello:14\"", &schema, &facet_list).is_err());
// unexisting key
assert!(FacetFilter::from_str("[\"foo:12\"]", &schema, &facet_list).is_err());
// invalid facet key
assert!(FacetFilter::from_str("[\"foo=12\"]", &schema, &facet_list).is_err());
assert!(FacetFilter::from_str("[\"foo12\"]", &schema, &facet_list).is_err());
assert!(FacetFilter::from_str("[\"\"]", &schema, &facet_list).is_err());
// empty array error
assert!(FacetFilter::from_str("[]", &schema, &facet_list).is_err());
assert!(FacetFilter::from_str("[\"hello:12\", []]", &schema, &facet_list).is_err());
}
}

View File

@ -1,276 +0,0 @@
use std::str::FromStr;
use std::cmp::Ordering;
use crate::error::Error;
use crate::{store::Index, DocumentId, MainT};
use heed::RoTxn;
use meilisearch_schema::{FieldId, Schema};
use pest::error::{Error as PestError, ErrorVariant};
use pest::iterators::Pair;
use serde_json::{Value, Number};
use super::parser::Rule;
#[derive(Debug, PartialEq)]
enum ConditionType {
Greater,
Less,
Equal,
LessEqual,
GreaterEqual,
NotEqual,
}
/// We need to infer type when the filter is constructed
/// and match every possible types it can be parsed into.
#[derive(Debug)]
struct ConditionValue<'a> {
string: &'a str,
boolean: Option<bool>,
number: Option<Number>
}
impl<'a> ConditionValue<'a> {
pub fn new(value: &Pair<'a, Rule>) -> Self {
match value.as_rule() {
Rule::string | Rule::word => {
let string = value.as_str();
let boolean = match value.as_str() {
"true" => Some(true),
"false" => Some(false),
_ => None,
};
let number = Number::from_str(value.as_str()).ok();
ConditionValue { string, boolean, number }
},
_ => unreachable!(),
}
}
pub fn as_str(&self) -> &str {
self.string
}
pub fn as_number(&self) -> Option<&Number> {
self.number.as_ref()
}
pub fn as_bool(&self) -> Option<bool> {
self.boolean
}
}
#[derive(Debug)]
pub struct Condition<'a> {
field: FieldId,
condition: ConditionType,
value: ConditionValue<'a>
}
fn get_field_value<'a>(schema: &Schema, pair: Pair<'a, Rule>) -> Result<(FieldId, ConditionValue<'a>), Error> {
let mut items = pair.into_inner();
// lexing ensures that we at least have a key
let key = items.next().unwrap();
let field = schema
.id(key.as_str())
.ok_or_else(|| PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
key.as_str(),
schema.names().collect::<Vec<_>>().join(", ")
),
},
key.as_span()))?;
let value = ConditionValue::new(&items.next().unwrap());
Ok((field, value))
}
// undefined behavior with big numbers
fn compare_numbers(lhs: &Number, rhs: &Number) -> Option<Ordering> {
match (lhs.as_i64(), lhs.as_u64(), lhs.as_f64(),
rhs.as_i64(), rhs.as_u64(), rhs.as_f64()) {
// i64 u64 f64 i64 u64 f64
(Some(lhs), _, _, Some(rhs), _, _) => lhs.partial_cmp(&rhs),
(_, Some(lhs), _, _, Some(rhs), _) => lhs.partial_cmp(&rhs),
(_, _, Some(lhs), _, _, Some(rhs)) => lhs.partial_cmp(&rhs),
(_, _, _, _, _, _) => None,
}
}
impl<'a> Condition<'a> {
pub fn less(
item: Pair<'a, Rule>,
schema: &'a Schema,
) -> Result<Self, Error> {
let (field, value) = get_field_value(schema, item)?;
let condition = ConditionType::Less;
Ok(Self { field, condition, value })
}
pub fn greater(
item: Pair<'a, Rule>,
schema: &'a Schema,
) -> Result<Self, Error> {
let (field, value) = get_field_value(schema, item)?;
let condition = ConditionType::Greater;
Ok(Self { field, condition, value })
}
pub fn neq(
item: Pair<'a, Rule>,
schema: &'a Schema,
) -> Result<Self, Error> {
let (field, value) = get_field_value(schema, item)?;
let condition = ConditionType::NotEqual;
Ok(Self { field, condition, value })
}
pub fn geq(
item: Pair<'a, Rule>,
schema: &'a Schema,
) -> Result<Self, Error> {
let (field, value) = get_field_value(schema, item)?;
let condition = ConditionType::GreaterEqual;
Ok(Self { field, condition, value })
}
pub fn leq(
item: Pair<'a, Rule>,
schema: &'a Schema,
) -> Result<Self, Error> {
let (field, value) = get_field_value(schema, item)?;
let condition = ConditionType::LessEqual;
Ok(Self { field, condition, value })
}
pub fn eq(
item: Pair<'a, Rule>,
schema: &'a Schema,
) -> Result<Self, Error> {
let (field, value) = get_field_value(schema, item)?;
let condition = ConditionType::Equal;
Ok(Self { field, condition, value })
}
pub fn test(
&self,
reader: &RoTxn<MainT>,
index: &Index,
document_id: DocumentId,
) -> Result<bool, Error> {
match index.document_attribute::<Value>(reader, document_id, self.field)? {
Some(Value::Array(values)) => Ok(values.iter().any(|v| self.match_value(Some(v)))),
other => Ok(self.match_value(other.as_ref())),
}
}
fn match_value(&self, value: Option<&Value>) -> bool {
match value {
Some(Value::String(s)) => {
let value = self.value.as_str();
match self.condition {
ConditionType::Equal => unicase::eq(value, &s),
ConditionType::NotEqual => !unicase::eq(value, &s),
_ => false
}
},
Some(Value::Number(n)) => {
if let Some(value) = self.value.as_number() {
if let Some(ord) = compare_numbers(&n, value) {
let res = match self.condition {
ConditionType::Equal => ord == Ordering::Equal,
ConditionType::NotEqual => ord != Ordering::Equal,
ConditionType::GreaterEqual => ord != Ordering::Less,
ConditionType::LessEqual => ord != Ordering::Greater,
ConditionType::Greater => ord == Ordering::Greater,
ConditionType::Less => ord == Ordering::Less,
};
return res
}
}
false
},
Some(Value::Bool(b)) => {
if let Some(value) = self.value.as_bool() {
let res = match self.condition {
ConditionType::Equal => *b == value,
ConditionType::NotEqual => *b != value,
_ => false
};
return res
}
false
},
// if field is not supported (or not found), all values are different from it,
// so != should always return true in this case.
_ => self.condition == ConditionType::NotEqual,
}
}
}
#[cfg(test)]
mod test {
use super::*;
use serde_json::Number;
use std::cmp::Ordering;
#[test]
fn test_number_comp() {
// test both u64
let n1 = Number::from(1u64);
let n2 = Number::from(2u64);
assert_eq!(Some(Ordering::Less), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Greater), compare_numbers(&n2, &n1));
let n1 = Number::from(1u64);
let n2 = Number::from(1u64);
assert_eq!(Some(Ordering::Equal), compare_numbers(&n1, &n2));
// test both i64
let n1 = Number::from(1i64);
let n2 = Number::from(2i64);
assert_eq!(Some(Ordering::Less), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Greater), compare_numbers(&n2, &n1));
let n1 = Number::from(1i64);
let n2 = Number::from(1i64);
assert_eq!(Some(Ordering::Equal), compare_numbers(&n1, &n2));
// test both f64
let n1 = Number::from_f64(1f64).unwrap();
let n2 = Number::from_f64(2f64).unwrap();
assert_eq!(Some(Ordering::Less), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Greater), compare_numbers(&n2, &n1));
let n1 = Number::from_f64(1f64).unwrap();
let n2 = Number::from_f64(1f64).unwrap();
assert_eq!(Some(Ordering::Equal), compare_numbers(&n1, &n2));
// test one u64 and one f64
let n1 = Number::from_f64(1f64).unwrap();
let n2 = Number::from(2u64);
assert_eq!(Some(Ordering::Less), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Greater), compare_numbers(&n2, &n1));
// equality
let n1 = Number::from_f64(1f64).unwrap();
let n2 = Number::from(1u64);
assert_eq!(Some(Ordering::Equal), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Equal), compare_numbers(&n2, &n1));
// float is neg
let n1 = Number::from_f64(-1f64).unwrap();
let n2 = Number::from(1u64);
assert_eq!(Some(Ordering::Less), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Greater), compare_numbers(&n2, &n1));
// float is too big
let n1 = Number::from_f64(std::f64::MAX).unwrap();
let n2 = Number::from(1u64);
assert_eq!(Some(Ordering::Greater), compare_numbers(&n1, &n2));
assert_eq!(Some(Ordering::Less), compare_numbers(&n2, &n1));
// misc
let n1 = Number::from_f64(std::f64::MAX).unwrap();
let n2 = Number::from(std::u64::MAX);
assert_eq!(Some(Ordering::Greater), compare_numbers(&n1, &n2));
assert_eq!(Some( Ordering::Less ), compare_numbers(&n2, &n1));
}
}

View File

@ -1,127 +0,0 @@
mod parser;
mod condition;
pub(crate) use parser::Rule;
use std::ops::Not;
use condition::Condition;
use crate::error::Error;
use crate::{DocumentId, MainT, store::Index};
use heed::RoTxn;
use meilisearch_schema::Schema;
use parser::{PREC_CLIMBER, FilterParser};
use pest::iterators::{Pair, Pairs};
use pest::Parser;
type FilterResult<'a> = Result<Filter<'a>, Error>;
#[derive(Debug)]
pub enum Filter<'a> {
Condition(Condition<'a>),
Or(Box<Self>, Box<Self>),
And(Box<Self>, Box<Self>),
Not(Box<Self>),
}
impl<'a> Filter<'a> {
pub fn parse(expr: &'a str, schema: &'a Schema) -> FilterResult<'a> {
let mut lexed = FilterParser::parse(Rule::prgm, expr)?;
Self::build(lexed.next().unwrap().into_inner(), schema)
}
pub fn test(
&self,
reader: &RoTxn<MainT>,
index: &Index,
document_id: DocumentId,
) -> Result<bool, Error> {
use Filter::*;
match self {
Condition(c) => c.test(reader, index, document_id),
Or(lhs, rhs) => Ok(
lhs.test(reader, index, document_id)? || rhs.test(reader, index, document_id)?
),
And(lhs, rhs) => Ok(
lhs.test(reader, index, document_id)? && rhs.test(reader, index, document_id)?
),
Not(op) => op.test(reader, index, document_id).map(bool::not),
}
}
fn build(expression: Pairs<'a, Rule>, schema: &'a Schema) -> FilterResult<'a> {
PREC_CLIMBER.climb(
expression,
|pair: Pair<Rule>| match pair.as_rule() {
Rule::eq => Ok(Filter::Condition(Condition::eq(pair, schema)?)),
Rule::greater => Ok(Filter::Condition(Condition::greater(pair, schema)?)),
Rule::less => Ok(Filter::Condition(Condition::less(pair, schema)?)),
Rule::neq => Ok(Filter::Condition(Condition::neq(pair, schema)?)),
Rule::geq => Ok(Filter::Condition(Condition::geq(pair, schema)?)),
Rule::leq => Ok(Filter::Condition(Condition::leq(pair, schema)?)),
Rule::prgm => Self::build(pair.into_inner(), schema),
Rule::term => Self::build(pair.into_inner(), schema),
Rule::not => Ok(Filter::Not(Box::new(Self::build(
pair.into_inner(),
schema,
)?))),
_ => unreachable!(),
},
|lhs: FilterResult, op: Pair<Rule>, rhs: FilterResult| match op.as_rule() {
Rule::or => Ok(Filter::Or(Box::new(lhs?), Box::new(rhs?))),
Rule::and => Ok(Filter::And(Box::new(lhs?), Box::new(rhs?))),
_ => unreachable!(),
},
)
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn invalid_syntax() {
assert!(FilterParser::parse(Rule::prgm, "field : id").is_err());
assert!(FilterParser::parse(Rule::prgm, "field=hello hello").is_err());
assert!(FilterParser::parse(Rule::prgm, "field=hello OR OR").is_err());
assert!(FilterParser::parse(Rule::prgm, "OR field:hello").is_err());
assert!(FilterParser::parse(Rule::prgm, r#"field="hello world"#).is_err());
assert!(FilterParser::parse(Rule::prgm, r#"field='hello world"#).is_err());
assert!(FilterParser::parse(Rule::prgm, "NOT field=").is_err());
assert!(FilterParser::parse(Rule::prgm, "N").is_err());
assert!(FilterParser::parse(Rule::prgm, "(field=1").is_err());
assert!(FilterParser::parse(Rule::prgm, "(field=1))").is_err());
assert!(FilterParser::parse(Rule::prgm, "field=1ORfield=2").is_err());
assert!(FilterParser::parse(Rule::prgm, "field=1 ( OR field=2)").is_err());
assert!(FilterParser::parse(Rule::prgm, "hello world=1").is_err());
assert!(FilterParser::parse(Rule::prgm, "").is_err());
assert!(FilterParser::parse(Rule::prgm, r#"((((((hello=world)))))"#).is_err());
}
#[test]
fn valid_syntax() {
assert!(FilterParser::parse(Rule::prgm, "field = id").is_ok());
assert!(FilterParser::parse(Rule::prgm, "field=id").is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field >= 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field <= 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field="hello world""#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field='hello world'"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field > 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field < 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field < 10 AND NOT field=5"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field < 10 AND NOT field > 7.5"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field=true OR NOT field=5"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"NOT field=true OR NOT field=5"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field='hello world' OR ( NOT field=true OR NOT field=5 )"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field='hello \'worl\'d' OR ( NOT field=true OR NOT field=5 )"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"field="hello \"worl\"d" OR ( NOT field=true OR NOT field=5 )"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"((((((hello=world))))))"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#""foo bar" > 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#""foo bar" = 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"'foo bar' = 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"'foo bar' <= 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"'foo bar' != 10"#).is_ok());
assert!(FilterParser::parse(Rule::prgm, r#"bar != 10"#).is_ok());
}
}

View File

@ -1,28 +0,0 @@
key = _{quoted | word}
value = _{quoted | word}
quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
string = {char*}
word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
char = _{ !(PEEK | "\\") ~ ANY
| "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
| "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
condition = _{eq | greater | less | geq | leq | neq}
geq = {key ~ ">=" ~ value}
leq = {key ~ "<=" ~ value}
neq = {key ~ "!=" ~ value}
eq = {key ~ "=" ~ value}
greater = {key ~ ">" ~ value}
less = {key ~ "<" ~ value}
prgm = {SOI ~ expr ~ EOI}
expr = _{ ( term ~ (operation ~ term)* ) }
term = { ("(" ~ expr ~ ")") | condition | not }
operation = _{ and | or }
and = {"AND"}
or = {"OR"}
not = {"NOT" ~ term}
WHITESPACE = _{ " " }

View File

@ -1,12 +0,0 @@
use once_cell::sync::Lazy;
use pest::prec_climber::{Operator, Assoc, PrecClimber};
pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| {
use Assoc::*;
use Rule::*;
pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)])
});
#[derive(Parser)]
#[grammar = "filters/parser/grammar.pest"]
pub struct FilterParser;

View File

@ -1,134 +0,0 @@
use std::cmp::min;
use std::collections::BTreeMap;
use std::ops::{Index, IndexMut};
// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array {
y_size: y,
buf: vec![value; x * y],
}
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
pub fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
let (n, m) = (source.len(), target.len());
assert!(
n <= m,
"the source string must be shorter than the target one"
);
if n == 0 {
return (m as u32, 0);
}
if m == 0 {
return (n as u32, 0);
}
if n == m && source == target {
return (0, m);
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..n + 1 {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..m + 1 {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.iter().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.iter().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = if char_s == char_t { 0 } else { 1 };
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in n..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x)
}
}
minimum
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matched_length() {
let query = "Levenste";
let text = "Levenshtein";
let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
assert_eq!(dist, 1);
assert_eq!(&text[..length], "Levenshte");
}
#[test]
#[should_panic]
fn matched_length_panic() {
let query = "Levenshtein";
let text = "Levenste";
// this function will panic if source if longer than target
prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
}
}

View File

@ -1,203 +0,0 @@
#![allow(clippy::type_complexity)]
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
#[macro_use]
extern crate pest_derive;
mod automaton;
mod bucket_sort;
mod database;
mod distinct_map;
mod error;
mod filters;
mod levenshtein;
mod number;
mod query_builder;
mod query_tree;
mod query_words_mapper;
mod ranked_map;
mod raw_document;
mod reordered_attrs;
pub mod criterion;
pub mod facets;
pub mod raw_indexer;
pub mod serde;
pub mod settings;
pub mod store;
pub mod update;
pub use self::database::{BoxUpdateFn, Database, DatabaseOptions, MainT, UpdateT, MainWriter, MainReader, UpdateWriter, UpdateReader};
pub use self::error::{Error, HeedError, FstError, MResult, pest_error, FacetError};
pub use self::filters::Filter;
pub use self::number::{Number, ParseNumberError};
pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
pub use meilisearch_schema::Schema;
pub use query_words_mapper::QueryWordsMapper;
pub use query_tree::MAX_QUERY_LEN;
use compact_arena::SmallArena;
use log::{error, trace};
use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::TryFrom;
use crate::bucket_sort::PostingsListView;
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::query_tree::{QueryId, QueryKind};
use crate::reordered_attrs::ReorderedAttrs;
type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;
type FstMapCow<'a> = fst::Map<Cow<'a, [u8]>>;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<crate::bucket_sort::SimpleMatch>,
}
fn highlights_from_raw_document<'a, 'tag, 'txn>(
raw_document: &RawDocument<'a, 'tag>,
queries_kinds: &HashMap<QueryId, &QueryKind>,
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
schema: &Schema,
) -> Vec<Highlight>
{
let mut highlights = Vec::new();
for bm in raw_document.bare_matches.iter() {
let postings_list = &arena[bm.postings_list];
let input = postings_list.input();
let kind = &queries_kinds.get(&bm.query_index);
for di in postings_list.iter() {
let covered_area = match kind {
Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
let len = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
u16::try_from(len).unwrap_or(u16::max_value())
},
_ => di.char_length,
};
let attribute = searchable_attrs
.and_then(|sa| sa.reverse(di.attribute))
.unwrap_or(di.attribute);
let attribute = match schema.indexed_pos_to_field_id(attribute) {
Some(field_id) => field_id.0,
None => {
error!("Cannot convert indexed_pos {} to field_id", attribute);
trace!("Schema is compromized; {:?}", schema);
continue
}
};
let highlight = Highlight {
attribute,
char_index: di.char_index,
char_length: covered_area,
};
highlights.push(highlight);
}
}
highlights
}
impl Document {
#[cfg(not(test))]
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
Document { id, highlights: highlights.to_owned() }
}
#[cfg(test)]
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
}
#[cfg(not(test))]
pub fn from_raw<'a, 'tag, 'txn>(
raw_document: RawDocument<'a, 'tag>,
queries_kinds: &HashMap<QueryId, &QueryKind>,
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
schema: &Schema,
) -> Document
{
let highlights = highlights_from_raw_document(
&raw_document,
queries_kinds,
arena,
searchable_attrs,
schema,
);
Document { id: raw_document.id, highlights }
}
#[cfg(test)]
pub fn from_raw<'a, 'tag, 'txn>(
raw_document: RawDocument<'a, 'tag>,
queries_kinds: &HashMap<QueryId, &QueryKind>,
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
schema: &Schema,
) -> Document
{
use crate::bucket_sort::SimpleMatch;
let highlights = highlights_from_raw_document(
&raw_document,
queries_kinds,
arena,
searchable_attrs,
schema,
);
let mut matches = Vec::new();
for sm in raw_document.processed_matches {
let attribute = searchable_attrs
.and_then(|sa| sa.reverse(sm.attribute))
.unwrap_or(sm.attribute);
let attribute = match schema.indexed_pos_to_field_id(attribute) {
Some(field_id) => field_id.0,
None => {
error!("Cannot convert indexed_pos {} to field_id", attribute);
trace!("Schema is compromized; {:?}", schema);
continue
}
};
matches.push(SimpleMatch { attribute, ..sm });
}
matches.sort_unstable();
Document { id: raw_document.id, highlights, matches }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 12);
}
}

View File

@ -1,120 +0,0 @@
use std::cmp::Ordering;
use std::fmt;
use std::num::{ParseFloatError, ParseIntError};
use std::str::FromStr;
use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(OrderedFloat<f64>),
Null,
}
impl Default for Number {
fn default() -> Self {
Self::Null
}
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let uint_error = match u64::from_str(s) {
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
Err(error) => error,
};
let int_error = match i64::from_str(s) {
Ok(signed) => return Ok(Number::Signed(signed)),
Err(error) => error,
};
let float_error = match f64::from_str(s) {
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
Err(error) => error,
};
Err(ParseNumberError {
uint_error,
int_error,
float_error,
})
}
}
impl PartialEq for Number {
fn eq(&self, other: &Number) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for Number {}
impl PartialOrd for Number {
fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Number {
fn cmp(&self, other: &Self) -> Ordering {
use Number::{Float, Signed, Unsigned, Null};
match (*self, *other) {
(Unsigned(a), Unsigned(b)) => a.cmp(&b),
(Unsigned(a), Signed(b)) => {
if b < 0 {
Ordering::Greater
} else {
a.cmp(&(b as u64))
}
}
(Unsigned(a), Float(b)) => (OrderedFloat(a as f64)).cmp(&b),
(Signed(a), Unsigned(b)) => {
if a < 0 {
Ordering::Less
} else {
(a as u64).cmp(&b)
}
}
(Signed(a), Signed(b)) => a.cmp(&b),
(Signed(a), Float(b)) => OrderedFloat(a as f64).cmp(&b),
(Float(a), Unsigned(b)) => a.cmp(&OrderedFloat(b as f64)),
(Float(a), Signed(b)) => a.cmp(&OrderedFloat(b as f64)),
(Float(a), Float(b)) => a.cmp(&b),
(Null, Null) => Ordering::Equal,
(_, Null) => Ordering::Less,
(Null, _) => Ordering::Greater,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
uint_error: ParseIntError,
int_error: ParseIntError,
float_error: ParseFloatError,
}
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.uint_error == self.int_error {
write!(
f,
"can not parse number: {}, {}",
self.uint_error, self.float_error
)
} else {
write!(
f,
"can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error
)
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,573 +0,0 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::ops::Range;
use std::time::Instant;
use std::{cmp, fmt, iter::once};
use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by};
use log::debug;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use sdset::{Set, SetBuf, SetOperation};
use crate::database::MainT;
use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::QueryWordsMapper;
pub const MAX_QUERY_LEN: usize = 10;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
And(Vec<Operation>),
Or(Vec<Operation>),
Query(Query),
}
impl fmt::Debug for Operation {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
match op {
Operation::And(children) => {
writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
Operation::Or(children) => {
writeln!(f, "{:1$}OR", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
}
}
pprint_tree(f, self, 0)
}
}
impl Operation {
fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::Tolerant(s.to_string()) })
}
fn non_tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::NonTolerant(s.to_string()) })
}
fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
Operation::Query(Query { id, prefix, exact: true, kind })
}
}
pub type QueryId = usize;
#[derive(Clone, Eq)]
pub struct Query {
pub id: QueryId,
pub prefix: bool,
pub exact: bool,
pub kind: QueryKind,
}
impl PartialEq for Query {
fn eq(&self, other: &Self) -> bool {
self.prefix == other.prefix && self.kind == other.kind
}
}
impl Hash for Query {
fn hash<H: Hasher>(&self, state: &mut H) {
self.prefix.hash(state);
self.kind.hash(state);
}
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
Tolerant(String),
NonTolerant(String),
Phrase(Vec<String>),
}
impl fmt::Debug for Query {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let Query { id, prefix, kind, .. } = self;
let prefix = if *prefix { String::from("Prefix") } else { String::default() };
match kind {
QueryKind::NonTolerant(word) => {
f.debug_struct(&(prefix + "NonTolerant")).field("id", &id).field("word", &word).finish()
},
QueryKind::Tolerant(word) => {
f.debug_struct(&(prefix + "Tolerant")).field("id", &id).field("word", &word).finish()
},
QueryKind::Phrase(words) => {
f.debug_struct(&(prefix + "Phrase")).field("id", &id).field("words", &words).finish()
},
}
}
}
#[derive(Debug, Default)]
pub struct PostingsList {
docids: SetBuf<DocumentId>,
matches: SetBuf<DocIndex>,
}
pub struct Context<'a> {
pub words_set: FstSetCow<'a>,
pub stop_words: FstSetCow<'a>,
pub synonyms: store::Synonyms,
pub postings_lists: store::PostingsLists,
pub prefix_postings_lists: store::PrefixPostingsListsCache,
}
fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'a str) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = ctx.postings_lists
.postings_list(reader, left.as_bytes())?
.map(|p| p.docids.len())
.unwrap_or(0);
let right_freq = ctx.postings_lists
.postings_list(reader, right.as_bytes())?
.map(|p| p.docids.len())
.unwrap_or(0);
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
let words = &words.join(" ");
let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?;
let mut strings = Vec::new();
let mut stream = set.stream();
while let Some(input) = stream.next() {
if let Ok(input) = std::str::from_utf8(input) {
let alts = input.split_ascii_whitespace().map(ToOwned::to_owned).collect();
strings.push(alts);
}
}
Ok(strings)
}
fn create_operation<I, F>(iter: I, f: F) -> Operation
where I: IntoIterator<Item=Operation>,
F: Fn(Vec<Operation>) -> Operation,
{
let mut iter = iter.into_iter();
match (iter.next(), iter.next()) {
(Some(first), None) => first,
(first, second) => f(first.into_iter().chain(second).chain(iter).collect()),
}
}
const MAX_NGRAM: usize = 3;
fn split_query_string<A: AsRef<[u8]>>(s: &str, stop_words: &fst::Set<A>) -> Vec<(usize, String)> {
// TODO: Use global instance instead
Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words))
.analyze(s)
.tokens()
.filter(|t| t.is_word())
.map(|t| t.word.to_string())
.take(MAX_QUERY_LEN)
.enumerate()
.collect()
}
pub fn create_query_tree(
reader: &heed::RoTxn<MainT>,
ctx: &Context,
query: &str,
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
{
// TODO: use a shared analyzer instance
let words = split_query_string(query, &ctx.stop_words);
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
fn create_inner(
reader: &heed::RoTxn<MainT>,
ctx: &Context,
mapper: &mut QueryWordsMapper,
words: &[(usize, String)],
) -> MResult<Vec<Operation>>
{
let mut alts = Vec::new();
for ngram in 1..=MAX_NGRAM {
if let Some(group) = words.get(..ngram) {
let mut group_ops = Vec::new();
let tail = &words[ngram..];
let is_last = tail.is_empty();
let mut group_alts = Vec::new();
match group {
[(id, word)] => {
let mut idgen = ((id + 1) * 100)..;
let range = (*id)..id+1;
let phrase = split_best_frequency(reader, ctx, word)?
.map(|ws| {
let id = idgen.next().unwrap();
idgen.next().unwrap();
mapper.declare(range.clone(), id, &[ws.0, ws.1]);
Operation::phrase2(id, is_last, ws)
});
let synonyms = fetch_synonyms(reader, ctx, &[word])?
.into_iter()
.map(|alts| {
let exact = alts.len() == 1;
let id = idgen.next().unwrap();
mapper.declare(range.clone(), id, &alts);
let mut idgen = once(id).chain(&mut idgen);
let iter = alts.into_iter().map(|w| {
let id = idgen.next().unwrap();
let kind = QueryKind::NonTolerant(w);
Operation::Query(Query { id, prefix: false, exact, kind })
});
create_operation(iter, Operation::And)
});
let original = Operation::tolerant(*id, is_last, word);
group_alts.push(original);
group_alts.extend(synonyms.chain(phrase));
},
words => {
let id = words[0].0;
let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
let range = id..id+ngram;
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
for synonym in fetch_synonyms(reader, ctx, &words)? {
let exact = synonym.len() == 1;
let id = idgen.next().unwrap();
mapper.declare(range.clone(), id, &synonym);
let mut idgen = once(id).chain(&mut idgen);
let synonym = synonym.into_iter().map(|s| {
let id = idgen.next().unwrap();
let kind = QueryKind::NonTolerant(s);
Operation::Query(Query { id, prefix: false, exact, kind })
});
group_alts.push(create_operation(synonym, Operation::And));
}
let id = idgen.next().unwrap();
let concat = words.concat();
mapper.declare(range.clone(), id, &[&concat]);
group_alts.push(Operation::non_tolerant(id, is_last, &concat));
}
}
group_ops.push(create_operation(group_alts, Operation::Or));
if !tail.is_empty() {
let tail_ops = create_inner(reader, ctx, mapper, tail)?;
group_ops.push(create_operation(tail_ops, Operation::Or));
}
alts.push(create_operation(group_ops, Operation::And));
}
}
Ok(alts)
}
let alternatives = create_inner(reader, ctx, &mut mapper, &words)?;
let operation = Operation::Or(alternatives);
let mapping = mapper.mapping();
Ok((operation, mapping))
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PostingsKey<'o> {
pub query: &'o Query,
pub input: Vec<u8>,
pub distance: u8,
pub is_exact: bool,
}
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
pub struct QueryResult<'o, 'txn> {
pub docids: Cow<'txn, Set<DocumentId>>,
pub queries: Postings<'o, 'txn>,
}
pub fn traverse_query_tree<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
tree: &'o Operation,
) -> MResult<QueryResult<'o, 'txn>>
{
fn execute_and<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
cache: &mut Cache<'o, 'txn>,
postings: &mut Postings<'o, 'txn>,
depth: usize,
operations: &'o [Operation],
) -> MResult<Cow<'txn, Set<DocumentId>>>
{
debug!("{:1$}AND", "", depth * 2);
let before = Instant::now();
let mut results = Vec::new();
for op in operations {
if cache.get(op).is_none() {
let docids = match op {
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
};
cache.insert(op, docids);
}
}
for op in operations {
if let Some(docids) = cache.get(op) {
results.push(docids.as_ref());
}
}
let op = sdset::multi::Intersection::new(results);
let docids = op.into_set_buf();
debug!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
Ok(Cow::Owned(docids))
}
fn execute_or<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
cache: &mut Cache<'o, 'txn>,
postings: &mut Postings<'o, 'txn>,
depth: usize,
operations: &'o [Operation],
) -> MResult<Cow<'txn, Set<DocumentId>>>
{
debug!("{:1$}OR", "", depth * 2);
let before = Instant::now();
let mut results = Vec::new();
for op in operations {
if cache.get(op).is_none() {
let docids = match op {
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
};
cache.insert(op, docids);
}
}
for op in operations {
if let Some(docids) = cache.get(op) {
results.push(docids.as_ref());
}
}
let op = sdset::multi::Union::new(results);
let docids = op.into_set_buf();
debug!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
Ok(Cow::Owned(docids))
}
fn execute_query<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
postings: &mut Postings<'o, 'txn>,
depth: usize,
query: &'o Query,
) -> MResult<Cow<'txn, Set<DocumentId>>>
{
let before = Instant::now();
let Query { prefix, kind, exact, .. } = query;
let docids: Cow<Set<_>> = match kind {
QueryKind::Tolerant(word) => {
if *prefix && word.len() <= 2 {
let prefix = {
let mut array = [0; 4];
let bytes = word.as_bytes();
array[..bytes.len()].copy_from_slice(bytes);
array
};
// We retrieve the cached postings lists for all
// the words that starts with this short prefix.
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
postings.insert(key, result.matches);
let prefix_docids = &result.docids;
// We retrieve the exact postings list for the prefix,
// because we must consider these matches as exact.
let result = ctx.postings_lists.postings_list(reader, word.as_bytes())?.unwrap_or_default();
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
postings.insert(key, result.matches);
let exact_docids = &result.docids;
let before = Instant::now();
let docids = sdset::duo::Union::new(prefix_docids, exact_docids).into_set_buf();
debug!("{:4$}prefix docids ({} and {}) construction took {:.02?}",
"", prefix_docids.len(), exact_docids.len(), before.elapsed(), depth * 2);
Cow::Owned(docids)
} else {
let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
let byte = word.as_bytes()[0];
let mut stream = if byte == u8::max_value() {
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
} else {
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
};
let before = Instant::now();
let mut results = Vec::new();
while let Some(input) = stream.next() {
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8();
let is_exact = *exact && distance == 0 && input.len() == word.len();
results.push(result.docids);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
postings.insert(key, result.matches);
}
}
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
let before = Instant::now();
let docids = if results.len() > 10 {
let cap = results.iter().map(|dis| dis.len()).sum();
let mut docids = Vec::with_capacity(cap);
for dis in results {
docids.extend_from_slice(&dis);
}
SetBuf::from_dirty(docids)
} else {
let sets = results.iter().map(AsRef::as_ref).collect();
sdset::multi::Union::new(sets).into_set_buf()
};
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
Cow::Owned(docids)
}
},
QueryKind::NonTolerant(word) => {
// TODO support prefix and non-prefix exact DFA
let dfa = build_exact_dfa(word);
let byte = word.as_bytes()[0];
let mut stream = if byte == u8::max_value() {
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
} else {
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
};
let before = Instant::now();
let mut results = Vec::new();
while let Some(input) = stream.next() {
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8();
results.push(result.docids);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: *exact };
postings.insert(key, result.matches);
}
}
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
let before = Instant::now();
let docids = if results.len() > 10 {
let cap = results.iter().map(|dis| dis.len()).sum();
let mut docids = Vec::with_capacity(cap);
for dis in results {
docids.extend_from_slice(&dis);
}
SetBuf::from_dirty(docids)
} else {
let sets = results.iter().map(AsRef::as_ref).collect();
sdset::multi::Union::new(sets).into_set_buf()
};
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
Cow::Owned(docids)
},
QueryKind::Phrase(words) => {
// TODO support prefix and non-prefix exact DFA
if let [first, second] = words.as_slice() {
let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default();
let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default();
let iter = merge_join_by(first.matches.as_slice(), second.matches.as_slice(), |a, b| {
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
let y = (b.document_id, b.attribute, b.word_index as u32);
x.cmp(&y)
});
let matches: Vec<_> = iter
.filter_map(EitherOrBoth::both)
.flat_map(|(a, b)| once(*a).chain(Some(*b)))
.collect();
let before = Instant::now();
let mut docids: Vec<_> = matches.iter().map(|m| m.document_id).collect();
docids.dedup();
let docids = SetBuf::new(docids).unwrap();
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
let matches = Cow::Owned(SetBuf::from_dirty(matches));
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
postings.insert(key, matches);
Cow::Owned(docids)
} else {
debug!("{:2$}{:?} skipped", "", words, depth * 2);
Cow::default()
}
},
};
debug!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
Ok(docids)
}
let mut cache = Cache::new();
let mut postings = Postings::new();
let docids = match tree {
Operation::And(ops) => execute_and(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
Operation::Or(ops) => execute_or(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
Operation::Query(query) => execute_query(reader, ctx, &mut postings, 0, &query)?,
};
Ok(QueryResult { docids, queries: postings })
}

View File

@ -1,416 +0,0 @@
use std::collections::HashMap;
use std::iter::FromIterator;
use std::ops::Range;
use intervaltree::{Element, IntervalTree};
pub type QueryId = usize;
pub struct QueryWordsMapper {
originals: Vec<String>,
mappings: HashMap<QueryId, (Range<usize>, Vec<String>)>,
}
impl QueryWordsMapper {
pub fn new<I, A>(originals: I) -> QueryWordsMapper
where I: IntoIterator<Item = A>,
A: ToString,
{
let originals = originals.into_iter().map(|s| s.to_string()).collect();
QueryWordsMapper { originals, mappings: HashMap::new() }
}
#[allow(clippy::len_zero)]
pub fn declare<I, A>(&mut self, range: Range<usize>, id: QueryId, replacement: I)
where I: IntoIterator<Item = A>,
A: ToString,
{
assert!(range.len() != 0);
assert!(self.originals.get(range.clone()).is_some());
assert!(id >= self.originals.len());
let replacement: Vec<_> = replacement.into_iter().map(|s| s.to_string()).collect();
assert!(!replacement.is_empty());
// We detect words at the end and at the front of the
// replacement that are common with the originals:
//
// x a b c d e f g
// ^^^/ \^^^
// a b x c d k j e f
// ^^^ ^^^
//
let left = &self.originals[..range.start];
let right = &self.originals[range.end..];
let common_left = longest_common_prefix(left, &replacement);
let common_right = longest_common_prefix(&replacement, right);
for i in 0..common_left {
let range = range.start - common_left + i..range.start - common_left + i + 1;
let replacement = vec![replacement[i].clone()];
self.mappings.insert(id + i, (range, replacement));
}
{
let replacement = replacement[common_left..replacement.len() - common_right].to_vec();
self.mappings.insert(id + common_left, (range.clone(), replacement));
}
for i in 0..common_right {
let id = id + replacement.len() - common_right + i;
let range = range.end + i..range.end + i + 1;
let replacement = vec![replacement[replacement.len() - common_right + i].clone()];
self.mappings.insert(id, (range, replacement));
}
}
pub fn mapping(self) -> HashMap<QueryId, Range<usize>> {
let mappings = self.mappings.into_iter().map(|(i, (r, v))| (r, (i, v)));
let intervals = IntervalTree::from_iter(mappings);
let mut output = HashMap::new();
let mut offset = 0;
// We map each original word to the biggest number of
// associated words.
for i in 0..self.originals.len() {
let max = intervals.query_point(i)
.filter_map(|e| {
if e.range.end - 1 == i {
let len = e.value.1.iter().skip(i - e.range.start).count();
if len != 0 { Some(len) } else { None }
} else { None }
})
.max()
.unwrap_or(1);
let range = i + offset..i + offset + max;
output.insert(i, range);
offset += max - 1;
}
// We retrieve the range that each original word
// is mapped to and apply it to each of the words.
for i in 0..self.originals.len() {
let iter = intervals.query_point(i).filter(|e| e.range.end - 1 == i);
for Element { range, value: (id, words) } in iter {
// We ask for the complete range mapped to the area we map.
let start = output.get(&range.start).map(|r| r.start).unwrap_or(range.start);
let end = output.get(&(range.end - 1)).map(|r| r.end).unwrap_or(range.end);
let range = start..end;
// We map each query id to one word until the last,
// we map it to the remainings words.
let add = range.len() - words.len();
for (j, x) in range.take(words.len()).enumerate() {
let add = if j == words.len() - 1 { add } else { 0 }; // is last?
let range = x..x + 1 + add;
output.insert(id + j, range);
}
}
}
output
}
}
fn longest_common_prefix<T: Eq + std::fmt::Debug>(a: &[T], b: &[T]) -> usize {
let mut best = None;
for i in (0..a.len()).rev() {
let count = a[i..].iter().zip(b).take_while(|(a, b)| a == b).count();
best = match best {
Some(old) if count > old => Some(count),
Some(_) => break,
None => Some(count),
};
}
best.unwrap_or(0)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
// new = new york city
builder.declare(0..1, 7, &["new", "york", "city"]);
// ^ 7 8 9
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // new
assert_eq!(mapping[&1], 1..2); // york
assert_eq!(mapping[&2], 2..3); // city
assert_eq!(mapping[&3], 3..4); // subway
assert_eq!(mapping[&4], 0..1); // new
assert_eq!(mapping[&5], 1..2); // york
assert_eq!(mapping[&6], 2..3); // city
assert_eq!(mapping[&7], 0..1); // new
assert_eq!(mapping[&8], 1..2); // york
assert_eq!(mapping[&9], 2..3); // city
}
#[test]
fn original_unmodified2() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// city subway = new york city underground train
builder.declare(2..4, 4, &["new", "york", "city", "underground", "train"]);
// ^ 4 5 6 7 8
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // new
assert_eq!(mapping[&1], 1..2); // york
assert_eq!(mapping[&2], 2..3); // city
assert_eq!(mapping[&3], 3..5); // subway
assert_eq!(mapping[&4], 0..1); // new
assert_eq!(mapping[&5], 1..2); // york
assert_eq!(mapping[&6], 2..3); // city
assert_eq!(mapping[&7], 3..4); // underground
assert_eq!(mapping[&8], 4..5); // train
}
#[test]
fn original_unmodified3() {
let query = ["a", "b", "x", "x", "a", "b", "c", "d", "e", "f", "g"];
// 0 1 2 3 4 5 6 7 8 9 10
let mut builder = QueryWordsMapper::new(&query);
// c d = a b x c d k j e f
builder.declare(6..8, 11, &["a", "b", "x", "c", "d", "k", "j", "e", "f"]);
// ^^ 11 12 13 14 15 16 17 18 19
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // a
assert_eq!(mapping[&1], 1..2); // b
assert_eq!(mapping[&2], 2..3); // x
assert_eq!(mapping[&3], 3..4); // x
assert_eq!(mapping[&4], 4..5); // a
assert_eq!(mapping[&5], 5..6); // b
assert_eq!(mapping[&6], 6..7); // c
assert_eq!(mapping[&7], 7..11); // d
assert_eq!(mapping[&8], 11..12); // e
assert_eq!(mapping[&9], 12..13); // f
assert_eq!(mapping[&10], 13..14); // g
assert_eq!(mapping[&11], 4..5); // a
assert_eq!(mapping[&12], 5..6); // b
assert_eq!(mapping[&13], 6..7); // x
assert_eq!(mapping[&14], 7..8); // c
assert_eq!(mapping[&15], 8..9); // d
assert_eq!(mapping[&16], 9..10); // k
assert_eq!(mapping[&17], 10..11); // j
assert_eq!(mapping[&18], 11..12); // e
assert_eq!(mapping[&19], 12..13); // f
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryWordsMapper::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // new
assert_eq!(mapping[&1], 1..3); // york
assert_eq!(mapping[&2], 3..4); // subway
assert_eq!(mapping[&3], 0..1); // new
assert_eq!(mapping[&4], 1..2); // york
assert_eq!(mapping[&5], 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryWordsMapper::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..3); // NY
assert_eq!(mapping[&1], 3..5); // subway
assert_eq!(mapping[&2], 0..1); // new
assert_eq!(mapping[&3], 1..3); // york
assert_eq!(mapping[&4], 0..1); // new
assert_eq!(mapping[&5], 1..2); // york
assert_eq!(mapping[&6], 2..3); // city
assert_eq!(mapping[&7], 0..3); // NYC
assert_eq!(mapping[&8], 0..1); // new
assert_eq!(mapping[&9], 1..2); // york
assert_eq!(mapping[&10], 2..3); // city
assert_eq!(mapping[&11], 3..4); // underground
assert_eq!(mapping[&12], 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..3); // NYC
assert_eq!(mapping[&1], 3..4); // subway
assert_eq!(mapping[&2], 0..1); // new
assert_eq!(mapping[&3], 1..2); // york
assert_eq!(mapping[&4], 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // great
assert_eq!(mapping[&1], 1..2); // awesome
assert_eq!(mapping[&2], 2..5); // NYC
assert_eq!(mapping[&3], 5..6); // subway
assert_eq!(mapping[&4], 2..3); // new
assert_eq!(mapping[&5], 3..4); // york
assert_eq!(mapping[&6], 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // NYC
assert_eq!(mapping[&1], 1..3); // subway
assert_eq!(mapping[&2], 1..2); // underground
assert_eq!(mapping[&3], 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // great
assert_eq!(mapping[&1], 1..2); // awesome
assert_eq!(mapping[&2], 2..5); // NYC
assert_eq!(mapping[&3], 5..7); // subway
assert_eq!(mapping[&4], 2..3); // new
assert_eq!(mapping[&5], 3..4); // york
assert_eq!(mapping[&6], 4..5); // city
assert_eq!(mapping[&7], 5..6); // underground
assert_eq!(mapping[&8], 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // great
assert_eq!(mapping[&1], 1..2); // awesome
assert_eq!(mapping[&2], 2..5); // NYC
assert_eq!(mapping[&3], 5..7); // subway
assert_eq!(mapping[&4], 2..3); // new
assert_eq!(mapping[&5], 3..4); // york
assert_eq!(mapping[&6], 4..5); // city
assert_eq!(mapping[&7], 5..6); // underground
assert_eq!(mapping[&8], 6..7); // train
assert_eq!(mapping[&9], 0..2); // good
assert_eq!(mapping[&10], 1..5); // NY
assert_eq!(mapping[&11], 2..7); // metro
}
}

View File

@ -1,41 +0,0 @@
use std::io::{Read, Write};
use hashbrown::HashMap;
use meilisearch_schema::FieldId;
use serde::{Deserialize, Serialize};
use crate::{DocumentId, Number};
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(transparent)]
pub struct RankedMap(HashMap<(DocumentId, FieldId), Number>);
impl RankedMap {
pub fn len(&self) -> usize {
self.0.len()
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn insert(&mut self, document: DocumentId, field: FieldId, number: Number) {
self.0.insert((document, field), number);
}
pub fn remove(&mut self, document: DocumentId, field: FieldId) {
self.0.remove(&(document, field));
}
pub fn get(&self, document: DocumentId, field: FieldId) -> Option<Number> {
self.0.get(&(document, field)).cloned()
}
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
bincode::deserialize_from(reader).map(RankedMap)
}
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
bincode::serialize_into(writer, &self.0)
}
}

View File

@ -1,51 +0,0 @@
use compact_arena::SmallArena;
use sdset::SetBuf;
use crate::DocIndex;
use crate::bucket_sort::{SimpleMatch, BareMatch, PostingsListView};
use crate::reordered_attrs::ReorderedAttrs;
pub struct RawDocument<'a, 'tag> {
pub id: crate::DocumentId,
pub bare_matches: &'a mut [BareMatch<'tag>],
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance` found
pub processed_distances: Vec<Option<u8>>,
/// Does this document contains a field
/// with one word that is exactly matching
pub contains_one_word_field: bool,
}
impl<'a, 'tag> RawDocument<'a, 'tag> {
pub fn new<'txn>(
bare_matches: &'a mut [BareMatch<'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
) -> RawDocument<'a, 'tag>
{
if let Some(reordered_attrs) = searchable_attrs {
for bm in bare_matches.iter() {
let postings_list = &postings_lists[bm.postings_list];
let mut rewritten = Vec::new();
for di in postings_list.iter() {
if let Some(attribute) = reordered_attrs.get(di.attribute) {
rewritten.push(DocIndex { attribute, ..*di });
}
}
let new_postings = SetBuf::from_dirty(rewritten);
postings_lists[bm.postings_list].rewrite_with(new_postings);
}
}
bare_matches.sort_unstable_by_key(|m| m.query_index);
RawDocument {
id: bare_matches[0].document_id,
bare_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
contains_one_word_field: false,
}
}
}

View File

@ -1,344 +0,0 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use sdset::SetBuf;
use crate::{DocIndex, DocumentId};
use crate::FstSetCow;
const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer<'a, A> {
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
analyzer: Analyzer<'a, A>,
}
pub struct Indexed<'a> {
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
}
impl<'a, A> RawIndexer<'a, A>
where
A: AsRef<[u8]>
{
pub fn new(stop_words: &'a fst::Set<A>) -> RawIndexer<'a, A> {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit(stop_words: &'a fst::Set<A>, limit: usize) -> RawIndexer<A> {
RawIndexer {
word_limit: limit,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)),
}
}
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
let mut number_of_words = 0;
let analyzed_text = self.analyzer.analyze(text);
for (token_pos, (word_pos, token)) in process_tokens(analyzed_text.tokens()).enumerate() {
let must_continue = index_token(
token,
word_pos,
token_pos,
id,
indexed_pos,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
number_of_words += 1;
if !must_continue {
break;
}
}
number_of_words
}
pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
where
I: IntoIterator<Item = &'s str>,
{
let mut word_offset = 0;
for text in text_iter.into_iter() {
let current_word_offset = word_offset;
let analyzed_text = self.analyzer.analyze(text);
let tokens = process_tokens(analyzed_text.tokens())
.map(|(i, t)| (i + current_word_offset, t))
.enumerate();
for (token_pos, (word_pos, token)) in tokens {
word_offset = word_pos + 1;
let must_continue = index_token(
token,
word_pos,
token_pos,
id,
indexed_pos,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
}
}
}
pub fn build(self) -> Indexed<'static> {
let words_doc_indexes = self
.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self
.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
words.dedup();
let fst = fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap();
(id, fst)
})
.collect();
Indexed {
words_doc_indexes,
docs_words,
}
}
}
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.skip_while(|token| !token.is_word())
.scan((0, None), |(offset, prev_kind), token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
*prev_kind = Some(token.kind);
}
_ => (),
}
Some((*offset, token))
})
.filter(|(_, t)| t.is_word())
}
#[allow(clippy::too_many_arguments)]
fn index_token(
token: Token,
word_pos: usize,
token_pos: usize,
id: DocumentId,
indexed_pos: IndexedPos,
word_limit: usize,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
if token_pos >= word_limit {
return false;
}
if !token.is_stopword() {
match token_to_docindex(id, indexed_pos, &token, word_pos) {
Some(docindex) => {
let word = Vec::from(token.word.as_ref());
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
}
None => return false,
}
}
true
}
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option<DocIndex> {
let word_index = u16::try_from(word_index).ok()?;
let char_index = u16::try_from(token.byte_start).ok()?;
let char_length = u16::try_from(token.word.len()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: indexed_pos.0,
word_index,
char_index,
char_length,
};
Some(docindex)
}
#[cfg(test)]
mod tests {
use super::*;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use fst::Set;
#[test]
fn test_process_token() {
let text = " 為一包含一千多萬目詞的帶標記平衡語料庫";
let stopwords = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
let analyzer = analyzer.analyze(text);
let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
assert_eq!(tokens, ["", "", "包含", "一千多万", "目词", "", "", "标记", "平衡", "语料库"]);
}
#[test]
fn strange_apostrophe() {
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = "Zut, laspirateur, jai oublié de léteindre !";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = vec!["Zut, laspirateur, jai oublié de léteindre !"];
indexer.index_text_seq(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
}
#[test]
fn basic_stop_words() {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = "Zut, laspirateur, jai oublié de léteindre !";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_none());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"j"[..]).is_none());
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
}
#[test]
fn no_empty_unidecode() {
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = "🇯🇵";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes
.get(&"🇯🇵".to_owned().into_bytes())
.is_some());
}
#[test]
// test sample from 807
fn very_long_text() {
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let indexed_pos = IndexedPos(0);
let docid = DocumentId(0);
let text = " The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests.\nEach entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means &ldquo;requests for example.com/&rdquo;, while '/admin' means &ldquo;requests for example.com/admin&rdquo;. If multiple blocks could match an incoming request then the most-specific will apply.\nweb:locations:&#39;/&#39;:# Rules for all requests that don&#39;t otherwise match....&#39;/sites/default/files&#39;:# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally:\nweb:locations:&#39;/&#39;:passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example:\nweb:locations:&#39;/&#39;:root:&#39;web&#39;passthru:&#39;/app.php&#39;This block will serve requests to / from the web directory in the application, and if a file doesn&rsquo;t exist on disk then the request will be forwarded to the /app.php script.\nA full list of the possible subkeys for locations is below.\n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported.\n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below.\n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location.\n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). Times can be suffixed with &ldquo;ms&rdquo; (milliseconds), &ldquo;s&rdquo; (seconds), &ldquo;m&rdquo; (minutes), &ldquo;h&rdquo; (hours), &ldquo;d&rdquo; (days), &ldquo;w&rdquo; (weeks), &ldquo;M&rdquo; (months, 30d) or &ldquo;y&rdquo; (years, 365d).\n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP.\n allow: Whether to allow serving files which don&rsquo;t match a rule (true or false, default: true).\n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. Responses from the application aren&rsquo;t affected, to avoid overlap with the application&rsquo;s own ability to include custom headers in the response.\n rules: Specific overrides for a specific location. The key is a PCRE (regular expression) that is matched against the full request path.\n request_buffering: Most application servers do not support chunked requests (e.g. fpm, uwsgi), so Platform.sh enables request_buffering by default to handle them. That default configuration would look like this if it was present in .platform.app.yaml:\nweb:locations:&#39;/&#39;:passthru:truerequest_buffering:enabled:truemax_request_size:250mIf the application server can already efficiently handle chunked requests, the request_buffering subkey can be modified to disable it entirely (enabled: false). Additionally, applications that frequently deal with uploads greater than 250MB in size can update the max_request_size key to the application&rsquo;s needs. Note that modifications to request_buffering will need to be specified at each location where it is desired.\n ";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&"request".to_owned().into_bytes()).is_some());
}
#[test]
fn words_over_index_1000_not_indexed() {
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let indexed_pos = IndexedPos(0);
let docid = DocumentId(0);
let mut text = String::with_capacity(5000);
for _ in 0..1000 {
text.push_str("less ");
}
text.push_str("more");
indexer.index_text(docid, indexed_pos, &text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&"less".to_owned().into_bytes()).is_some());
assert!(words_doc_indexes.get(&"more".to_owned().into_bytes()).is_none());
}
}

View File

@ -1,31 +0,0 @@
use std::cmp;
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
reorders: Vec<Option<u16>>,
reverse: Vec<u16>,
}
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs { reorders: Vec::new(), reverse: Vec::new() }
}
pub fn insert_attribute(&mut self, attribute: u16) {
let new_len = cmp::max(attribute as usize + 1, self.reorders.len());
self.reorders.resize(new_len, None);
self.reorders[attribute as usize] = Some(self.reverse.len() as u16);
self.reverse.push(attribute);
}
pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize)? {
Some(attribute) => Some(*attribute),
None => None,
}
}
pub fn reverse(&self, attribute: u16) -> Option<u16> {
self.reverse.get(attribute as usize).copied()
}
}

View File

@ -1,161 +0,0 @@
use std::collections::HashSet;
use std::io::Cursor;
use std::{error::Error, fmt};
use meilisearch_schema::{Schema, FieldId};
use serde::{de, forward_to_deserialize_any};
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::Error as SerdeJsonError;
use crate::database::MainT;
use crate::store::DocumentsFields;
use crate::DocumentId;
#[derive(Debug)]
pub enum DeserializerError {
SerdeJson(SerdeJsonError),
Zlmdb(heed::Error),
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::SerdeJson(e) => write!(f, "serde json related error: {}", e),
DeserializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for DeserializerError {}
impl From<SerdeJsonError> for DeserializerError {
fn from(error: SerdeJsonError) -> DeserializerError {
DeserializerError::SerdeJson(error)
}
}
impl From<heed::Error> for DeserializerError {
fn from(error: heed::Error) -> DeserializerError {
DeserializerError::Zlmdb(error)
}
}
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub reader: &'a heed::RoTxn<'a, MainT>,
pub documents_fields: DocumentsFields,
pub schema: &'a Schema,
pub fields: Option<&'a HashSet<FieldId>>,
}
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: de::Visitor<'de>,
{
self.deserialize_option(visitor)
}
fn deserialize_option<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: de::Visitor<'de>,
{
self.deserialize_map(visitor)
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: de::Visitor<'de>,
{
let mut error = None;
let iter = self
.documents_fields
.document_fields(self.reader, self.document_id)?
.filter_map(|result| {
let (attr, value) = match result {
Ok(value) => value,
Err(e) => {
error = Some(e);
return None;
}
};
let is_displayed = self.schema.is_displayed(attr);
if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) {
if let Some(attribute_name) = self.schema.name(attr) {
let cursor = Cursor::new(value.to_owned());
let ioread = SerdeJsonIoRead::new(cursor);
let value = Value(SerdeJsonDeserializer::new(ioread));
Some((attribute_name, value))
} else {
None
}
} else {
None
}
});
let mut iter = iter.peekable();
let result = match iter.peek() {
Some(_) => {
let map_deserializer = de::value::MapDeserializer::new(iter);
visitor
.visit_some(map_deserializer)
.map_err(DeserializerError::from)
}
None => visitor.visit_none(),
};
match error.take() {
Some(error) => Err(error.into()),
None => result,
}
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
}
struct Value(SerdeJsonDeserializer<SerdeJsonIoRead<Cursor<Vec<u8>>>>);
impl<'de> de::IntoDeserializer<'de, SerdeJsonError> for Value {
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
impl<'de> de::Deserializer<'de> for Value {
type Error = SerdeJsonError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where
V: de::Visitor<'de>,
{
self.0.deserialize_any(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct map struct enum identifier ignored_any
}
}

View File

@ -1,92 +0,0 @@
mod deserializer;
pub use self::deserializer::{Deserializer, DeserializerError};
use std::{error::Error, fmt};
use serde::ser;
use serde_json::Error as SerdeJsonError;
use meilisearch_schema::Error as SchemaError;
use crate::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
InvalidDocumentIdFormat,
Zlmdb(heed::Error),
SerdeJson(SerdeJsonError),
ParseNumber(ParseNumberError),
Schema(SchemaError),
UnserializableType { type_name: &'static str },
UnindexableType { type_name: &'static str },
UnrankableType { type_name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
f.write_str("Primary key is missing.")
}
SerializerError::InvalidDocumentIdFormat => {
f.write_str("a document primary key can be of type integer or string only composed of alphanumeric characters, hyphens (-) and underscores (_).")
}
SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e)
}
SerializerError::Schema(e) => write!(f, "impossible to update schema: {}", e),
SerializerError::UnserializableType { type_name } => {
write!(f, "{} is not a serializable type", type_name)
}
SerializerError::UnindexableType { type_name } => {
write!(f, "{} is not an indexable type", type_name)
}
SerializerError::UnrankableType { type_name } => {
write!(f, "{} types can not be used for ranking", type_name)
}
SerializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
impl From<SerdeJsonError> for SerializerError {
fn from(error: SerdeJsonError) -> SerializerError {
SerializerError::SerdeJson(error)
}
}
impl From<heed::Error> for SerializerError {
fn from(error: heed::Error) -> SerializerError {
SerializerError::Zlmdb(error)
}
}
impl From<ParseNumberError> for SerializerError {
fn from(error: ParseNumberError) -> SerializerError {
SerializerError::ParseNumber(error)
}
}
impl From<SchemaError> for SerializerError {
fn from(error: SchemaError) -> SerializerError {
SerializerError::Schema(error)
}
}

View File

@ -1,183 +0,0 @@
use std::collections::{BTreeMap, BTreeSet};
use std::str::FromStr;
use std::iter::IntoIterator;
use serde::{Deserialize, Deserializer, Serialize};
use once_cell::sync::Lazy;
use self::RankingRule::*;
pub const DEFAULT_RANKING_RULES: [RankingRule; 6] = [Typo, Words, Proximity, Attribute, WordsPosition, Exactness];
static RANKING_RULE_REGEX: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"(asc|desc)\(([a-zA-Z0-9-_]*)\)").unwrap()
});
#[derive(Default, Clone, Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct Settings {
#[serde(default, deserialize_with = "deserialize_some")]
pub ranking_rules: Option<Option<Vec<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub distinct_attribute: Option<Option<String>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub searchable_attributes: Option<Option<Vec<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub displayed_attributes: Option<Option<BTreeSet<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub stop_words: Option<Option<BTreeSet<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub synonyms: Option<Option<BTreeMap<String, Vec<String>>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub attributes_for_faceting: Option<Option<Vec<String>>>,
}
// Any value that is present is considered Some value, including null.
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where T: Deserialize<'de>,
D: Deserializer<'de>
{
Deserialize::deserialize(deserializer).map(Some)
}
impl Settings {
pub fn to_update(&self) -> Result<SettingsUpdate, RankingRuleConversionError> {
let settings = self.clone();
let ranking_rules = match settings.ranking_rules {
Some(Some(rules)) => UpdateState::Update(RankingRule::try_from_iter(rules.iter())?),
Some(None) => UpdateState::Clear,
None => UpdateState::Nothing,
};
Ok(SettingsUpdate {
ranking_rules,
distinct_attribute: settings.distinct_attribute.into(),
primary_key: UpdateState::Nothing,
searchable_attributes: settings.searchable_attributes.into(),
displayed_attributes: settings.displayed_attributes.into(),
stop_words: settings.stop_words.into(),
synonyms: settings.synonyms.into(),
attributes_for_faceting: settings.attributes_for_faceting.into(),
})
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateState<T> {
Update(T),
Clear,
Nothing,
}
impl <T> From<Option<Option<T>>> for UpdateState<T> {
fn from(opt: Option<Option<T>>) -> UpdateState<T> {
match opt {
Some(Some(t)) => UpdateState::Update(t),
Some(None) => UpdateState::Clear,
None => UpdateState::Nothing,
}
}
}
#[derive(Debug, Clone)]
pub struct RankingRuleConversionError;
impl std::fmt::Display for RankingRuleConversionError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "impossible to convert into RankingRule")
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RankingRule {
Typo,
Words,
Proximity,
Attribute,
WordsPosition,
Exactness,
Asc(String),
Desc(String),
}
impl std::fmt::Display for RankingRule {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
RankingRule::Typo => f.write_str("typo"),
RankingRule::Words => f.write_str("words"),
RankingRule::Proximity => f.write_str("proximity"),
RankingRule::Attribute => f.write_str("attribute"),
RankingRule::WordsPosition => f.write_str("wordsPosition"),
RankingRule::Exactness => f.write_str("exactness"),
RankingRule::Asc(field) => write!(f, "asc({})", field),
RankingRule::Desc(field) => write!(f, "desc({})", field),
}
}
}
impl FromStr for RankingRule {
type Err = RankingRuleConversionError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let rule = match s {
"typo" => RankingRule::Typo,
"words" => RankingRule::Words,
"proximity" => RankingRule::Proximity,
"attribute" => RankingRule::Attribute,
"wordsPosition" => RankingRule::WordsPosition,
"exactness" => RankingRule::Exactness,
_ => {
let captures = RANKING_RULE_REGEX.captures(s).ok_or(RankingRuleConversionError)?;
match (captures.get(1).map(|m| m.as_str()), captures.get(2)) {
(Some("asc"), Some(field)) => RankingRule::Asc(field.as_str().to_string()),
(Some("desc"), Some(field)) => RankingRule::Desc(field.as_str().to_string()),
_ => return Err(RankingRuleConversionError)
}
}
};
Ok(rule)
}
}
impl RankingRule {
pub fn field(&self) -> Option<&str> {
match self {
RankingRule::Asc(field) | RankingRule::Desc(field) => Some(field),
_ => None,
}
}
pub fn try_from_iter(rules: impl IntoIterator<Item = impl AsRef<str>>) -> Result<Vec<RankingRule>, RankingRuleConversionError> {
rules.into_iter()
.map(|s| RankingRule::from_str(s.as_ref()))
.collect()
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SettingsUpdate {
pub ranking_rules: UpdateState<Vec<RankingRule>>,
pub distinct_attribute: UpdateState<String>,
pub primary_key: UpdateState<String>,
pub searchable_attributes: UpdateState<Vec<String>>,
pub displayed_attributes: UpdateState<BTreeSet<String>>,
pub stop_words: UpdateState<BTreeSet<String>>,
pub synonyms: UpdateState<BTreeMap<String, Vec<String>>>,
pub attributes_for_faceting: UpdateState<Vec<String>>,
}
impl Default for SettingsUpdate {
fn default() -> Self {
Self {
ranking_rules: UpdateState::Nothing,
distinct_attribute: UpdateState::Nothing,
primary_key: UpdateState::Nothing,
searchable_attributes: UpdateState::Nothing,
displayed_attributes: UpdateState::Nothing,
stop_words: UpdateState::Nothing,
synonyms: UpdateState::Nothing,
attributes_for_faceting: UpdateState::Nothing,
}
}
}

View File

@ -1,32 +0,0 @@
use std::borrow::Cow;
use heed::{types::CowSlice, BytesEncode, BytesDecode};
use sdset::{Set, SetBuf};
use zerocopy::{AsBytes, FromBytes};
pub struct CowSet<T>(std::marker::PhantomData<T>);
impl<'a, T: 'a> BytesEncode<'a> for CowSet<T>
where
T: AsBytes,
{
type EItem = Set<T>;
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<[u8]>> {
CowSlice::bytes_encode(item.as_slice())
}
}
impl<'a, T: 'a> BytesDecode<'a> for CowSet<T>
where
T: FromBytes + Copy,
{
type DItem = Cow<'a, Set<T>>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
match CowSlice::<T>::bytes_decode(bytes)? {
Cow::Owned(vec) => Some(Cow::Owned(SetBuf::new_unchecked(vec))),
Cow::Borrowed(slice) => Some(Cow::Borrowed(Set::new_unchecked(slice))),
}
}
}

View File

@ -1,43 +0,0 @@
use std::borrow::Cow;
use heed::Result as ZResult;
use heed::types::{ByteSlice, OwnedType};
use crate::database::MainT;
use crate::{DocumentId, FstSetCow};
use super::BEU32;
#[derive(Copy, Clone)]
pub struct DocsWords {
pub(crate) docs_words: heed::Database<OwnedType<BEU32>, ByteSlice>,
}
impl DocsWords {
pub fn put_doc_words(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
words: &FstSetCow,
) -> ZResult<()> {
let document_id = BEU32::new(document_id.0);
let bytes = words.as_fst().as_bytes();
self.docs_words.put(writer, &document_id, bytes)
}
pub fn del_doc_words(self, writer: &mut heed::RwTxn<MainT>, document_id: DocumentId) -> ZResult<bool> {
let document_id = BEU32::new(document_id.0);
self.docs_words.delete(writer, &document_id)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.docs_words.clear(writer)
}
pub fn doc_words<'a>(self, reader: &'a heed::RoTxn<'a, MainT>, document_id: DocumentId) -> ZResult<FstSetCow> {
let document_id = BEU32::new(document_id.0);
match self.docs_words.get(reader, &document_id)? {
Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
}
}
}

View File

@ -1,79 +0,0 @@
use heed::types::{ByteSlice, OwnedType};
use crate::database::MainT;
use heed::Result as ZResult;
use meilisearch_schema::FieldId;
use super::DocumentFieldStoredKey;
use crate::DocumentId;
#[derive(Copy, Clone)]
pub struct DocumentsFields {
pub(crate) documents_fields: heed::Database<OwnedType<DocumentFieldStoredKey>, ByteSlice>,
}
impl DocumentsFields {
pub fn put_document_field(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
field: FieldId,
value: &[u8],
) -> ZResult<()> {
let key = DocumentFieldStoredKey::new(document_id, field);
self.documents_fields.put(writer, &key, value)
}
pub fn del_all_document_fields(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentFieldStoredKey::new(document_id, FieldId::min());
let end = DocumentFieldStoredKey::new(document_id, FieldId::max());
self.documents_fields.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.documents_fields.clear(writer)
}
pub fn document_attribute<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
field: FieldId,
) -> ZResult<Option<&'txn [u8]>> {
let key = DocumentFieldStoredKey::new(document_id, field);
self.documents_fields.get(reader, &key)
}
pub fn document_fields<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentFieldStoredKey::new(document_id, FieldId::min());
let end = DocumentFieldStoredKey::new(document_id, FieldId::max());
let iter = self.documents_fields.range(reader, &(start..=end))?;
Ok(DocumentFieldsIter { iter })
}
}
pub struct DocumentFieldsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentFieldStoredKey>, ByteSlice>,
}
impl<'txn> Iterator for DocumentFieldsIter<'txn> {
type Item = ZResult<(FieldId, &'txn [u8])>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, bytes))) => {
let field_id = FieldId(key.field_id.get());
Some(Ok((field_id, bytes)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}

View File

@ -1,143 +0,0 @@
use super::DocumentFieldIndexedKey;
use crate::database::MainT;
use crate::DocumentId;
use heed::types::OwnedType;
use heed::Result as ZResult;
use meilisearch_schema::IndexedPos;
use crate::MResult;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl DocumentsFieldsCounts {
pub fn put_document_field_count(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
attribute: IndexedPos,
value: u16,
) -> ZResult<()> {
let key = DocumentFieldIndexedKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value)
}
pub fn del_all_document_fields_counts(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentFieldIndexedKey::new(document_id, IndexedPos::min());
let end = DocumentFieldIndexedKey::new(document_id, IndexedPos::max());
self.documents_fields_counts.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.documents_fields_counts.clear(writer)
}
pub fn document_field_count(
self,
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: IndexedPos,
) -> ZResult<Option<u16>> {
let key = DocumentFieldIndexedKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)),
None => Ok(None),
}
}
pub fn document_fields_counts<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
let start = DocumentFieldIndexedKey::new(document_id, IndexedPos::min());
let end = DocumentFieldIndexedKey::new(document_id, IndexedPos::max());
let iter = self.documents_fields_counts.range(reader, &(start..=end))?;
Ok(DocumentFieldsCountsIter { iter })
}
pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn<MainT>) -> MResult<DocumentsIdsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter {
last_seen_id: None,
iter,
})
}
pub fn all_documents_fields_counts<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter })
}
}
pub struct DocumentFieldsCountsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl Iterator for DocumentFieldsCountsIter<'_> {
type Item = ZResult<(IndexedPos, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, count))) => {
let indexed_pos = IndexedPos(key.indexed_pos.get());
Some(Ok((indexed_pos, count)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}
pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>,
iter: heed::RoIter<'txn, OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl Iterator for DocumentsIdsIter<'_> {
type Item = MResult<DocumentId>;
fn next(&mut self) -> Option<Self::Item> {
for result in &mut self.iter {
match result {
Ok((key, _)) => {
let document_id = DocumentId(key.docid.get());
if Some(document_id) != self.last_seen_id {
self.last_seen_id = Some(document_id);
return Some(Ok(document_id));
}
}
Err(e) => return Some(Err(e.into())),
}
}
None
}
}
pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: heed::RoIter<'txn, OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, IndexedPos, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, count))) => {
let docid = DocumentId(key.docid.get());
let indexed_pos = IndexedPos(key.indexed_pos.get());
Some(Ok((docid, indexed_pos, count)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}

View File

@ -1,75 +0,0 @@
use std::borrow::Cow;
use heed::{BytesDecode, BytesEncode};
use sdset::Set;
use crate::DocumentId;
use super::cow_set::CowSet;
pub struct DocumentsIds;
impl BytesEncode<'_> for DocumentsIds {
type EItem = Set<DocumentId>;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
CowSet::bytes_encode(item)
}
}
impl<'a> BytesDecode<'a> for DocumentsIds {
type DItem = Cow<'a, Set<DocumentId>>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
CowSet::bytes_decode(bytes)
}
}
pub struct DiscoverIds<'a> {
ids_iter: std::slice::Iter<'a, DocumentId>,
left_id: Option<u32>,
right_id: Option<u32>,
available_range: std::ops::Range<u32>,
}
impl DiscoverIds<'_> {
pub fn new(ids: &Set<DocumentId>) -> DiscoverIds {
let mut ids_iter = ids.iter();
let right_id = ids_iter.next().map(|id| id.0);
let available_range = 0..right_id.unwrap_or(u32::max_value());
DiscoverIds { ids_iter, left_id: None, right_id, available_range }
}
}
impl Iterator for DiscoverIds<'_> {
type Item = DocumentId;
fn next(&mut self) -> Option<Self::Item> {
loop {
match self.available_range.next() {
// The available range gives us a new id, we return it.
Some(id) => return Some(DocumentId(id)),
// The available range is exhausted, we need to find the next one.
None if self.available_range.end == u32::max_value() => return None,
None => loop {
self.left_id = self.right_id.take();
self.right_id = self.ids_iter.next().map(|id| id.0);
match (self.left_id, self.right_id) {
// We found a gap in the used ids, we can yield all ids
// until the end of the gap
(Some(l), Some(r)) => if l.saturating_add(1) != r {
self.available_range = (l + 1)..r;
break;
},
// The last used id has been reached, we can use all ids
// until u32 MAX
(Some(l), None) => {
self.available_range = l.saturating_add(1)..u32::max_value();
break;
},
_ => (),
}
},
}
}
}
}

View File

@ -1,97 +0,0 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::mem;
use heed::{RwTxn, RoTxn, RoPrefix, types::Str, BytesEncode, BytesDecode};
use sdset::{SetBuf, Set, SetOperation};
use meilisearch_types::DocumentId;
use meilisearch_schema::FieldId;
use crate::MResult;
use crate::database::MainT;
use crate::facets::FacetKey;
use super::cow_set::CowSet;
/// contains facet info
#[derive(Clone, Copy)]
pub struct Facets {
pub(crate) facets: heed::Database<FacetKey, FacetData>,
}
pub struct FacetData;
impl<'a> BytesEncode<'a> for FacetData {
type EItem = (&'a str, &'a Set<DocumentId>);
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
// get size of the first item
let first_size = item.0.as_bytes().len();
let size = mem::size_of::<u64>()
+ first_size
+ item.1.len() * mem::size_of::<DocumentId>();
let mut buffer = Vec::with_capacity(size);
// encode the length of the first item
buffer.extend_from_slice(&first_size.to_be_bytes());
buffer.extend_from_slice(Str::bytes_encode(&item.0)?.as_ref());
let second_slice = CowSet::bytes_encode(&item.1)?;
buffer.extend_from_slice(second_slice.as_ref());
Some(Cow::Owned(buffer))
}
}
impl<'a> BytesDecode<'a> for FacetData {
type DItem = (&'a str, Cow<'a, Set<DocumentId>>);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
const LEN: usize = mem::size_of::<u64>();
let mut size_buf = [0; LEN];
size_buf.copy_from_slice(bytes.get(0..LEN)?);
// decode size of the first item from the bytes
let first_size = u64::from_be_bytes(size_buf);
// decode first and second items
let first_item = Str::bytes_decode(bytes.get(LEN..(LEN + first_size as usize))?)?;
let second_item = CowSet::bytes_decode(bytes.get((LEN + first_size as usize)..)?)?;
Some((first_item, second_item))
}
}
impl Facets {
// we use sdset::SetBuf to ensure the docids are sorted.
pub fn put_facet_document_ids(&self, writer: &mut RwTxn<MainT>, facet_key: FacetKey, doc_ids: &Set<DocumentId>, facet_value: &str) -> MResult<()> {
Ok(self.facets.put(writer, &facet_key, &(facet_value, doc_ids))?)
}
pub fn field_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, field_id: FieldId) -> MResult<RoPrefix<'txn, FacetKey, FacetData>> {
Ok(self.facets.prefix_iter(reader, &FacetKey::new(field_id, String::new()))?)
}
pub fn facet_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, facet_key: &FacetKey) -> MResult<Option<(&'txn str,Cow<'txn, Set<DocumentId>>)>> {
Ok(self.facets.get(reader, &facet_key)?)
}
/// updates the facets store, revmoving the documents from the facets provided in the
/// `facet_map` argument
pub fn remove(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)>) -> MResult<()> {
for (key, (name, document_ids)) in facet_map {
if let Some((_, old)) = self.facets.get(writer, &key)? {
let to_remove = SetBuf::from_dirty(document_ids);
let new = sdset::duo::OpBuilder::new(old.as_ref(), to_remove.as_set()).difference().into_set_buf();
self.facets.put(writer, &key, &(&name, new.as_set()))?;
}
}
Ok(())
}
pub fn add(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)>) -> MResult<()> {
for (key, (facet_name, document_ids)) in facet_map {
let set = SetBuf::from_dirty(document_ids);
self.put_facet_document_ids(writer, key, set.as_set(), &facet_name)?;
}
Ok(())
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> MResult<()> {
Ok(self.facets.clear(writer)?)
}
}

View File

@ -1,320 +0,0 @@
use std::borrow::Cow;
use std::collections::BTreeMap;
use chrono::{DateTime, Utc};
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str, CowSlice};
use meilisearch_schema::{FieldId, Schema};
use meilisearch_types::DocumentId;
use sdset::Set;
use crate::database::MainT;
use crate::{RankedMap, MResult};
use crate::settings::RankingRule;
use crate::{FstSetCow, FstMapCow};
use super::{CowSet, DocumentsIds};
const ATTRIBUTES_FOR_FACETING_KEY: &str = "attributes-for-faceting";
const CREATED_AT_KEY: &str = "created-at";
const CUSTOMS_KEY: &str = "customs";
const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute";
const EXTERNAL_DOCIDS_KEY: &str = "external-docids";
const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
const INTERNAL_DOCIDS_KEY: &str = "internal-docids";
const NAME_KEY: &str = "name";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map";
const RANKING_RULES_KEY: &str = "ranking-rules";
const SCHEMA_KEY: &str = "schema";
const SORTED_DOCUMENT_IDS_CACHE_KEY: &str = "sorted-document-ids-cache";
const STOP_WORDS_KEY: &str = "stop-words";
const SYNONYMS_KEY: &str = "synonyms";
const UPDATED_AT_KEY: &str = "updated-at";
const WORDS_KEY: &str = "words";
pub type FreqsMap = BTreeMap<String, usize>;
type SerdeFreqsMap = SerdeBincode<FreqsMap>;
type SerdeDatetime = SerdeBincode<DateTime<Utc>>;
#[derive(Copy, Clone)]
pub struct Main {
pub(crate) main: heed::PolyDatabase,
}
impl Main {
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> MResult<()> {
Ok(self.main.clear(writer)?)
}
pub fn put_name(self, writer: &mut heed::RwTxn<MainT>, name: &str) -> MResult<()> {
Ok(self.main.put::<_, Str, Str>(writer, NAME_KEY, name)?)
}
pub fn name(self, reader: &heed::RoTxn<MainT>) -> MResult<Option<String>> {
Ok(self
.main
.get::<_, Str, Str>(reader, NAME_KEY)?
.map(|name| name.to_owned()))
}
pub fn put_created_at(self, writer: &mut heed::RwTxn<MainT>) -> MResult<()> {
Ok(self.main.put::<_, Str, SerdeDatetime>(writer, CREATED_AT_KEY, &Utc::now())?)
}
pub fn created_at(self, reader: &heed::RoTxn<MainT>) -> MResult<Option<DateTime<Utc>>> {
Ok(self.main.get::<_, Str, SerdeDatetime>(reader, CREATED_AT_KEY)?)
}
pub fn put_updated_at(self, writer: &mut heed::RwTxn<MainT>) -> MResult<()> {
Ok(self.main.put::<_, Str, SerdeDatetime>(writer, UPDATED_AT_KEY, &Utc::now())?)
}
pub fn updated_at(self, reader: &heed::RoTxn<MainT>) -> MResult<Option<DateTime<Utc>>> {
Ok(self.main.get::<_, Str, SerdeDatetime>(reader, UPDATED_AT_KEY)?)
}
pub fn put_internal_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &sdset::Set<DocumentId>) -> MResult<()> {
Ok(self.main.put::<_, Str, DocumentsIds>(writer, INTERNAL_DOCIDS_KEY, ids)?)
}
pub fn internal_docids<'txn>(self, reader: &'txn heed::RoTxn<MainT>) -> MResult<Cow<'txn, sdset::Set<DocumentId>>> {
match self.main.get::<_, Str, DocumentsIds>(reader, INTERNAL_DOCIDS_KEY)? {
Some(ids) => Ok(ids),
None => Ok(Cow::default()),
}
}
pub fn merge_internal_docids(self, writer: &mut heed::RwTxn<MainT>, new_ids: &sdset::Set<DocumentId>) -> MResult<()> {
use sdset::SetOperation;
// We do an union of the old and new internal ids.
let internal_docids = self.internal_docids(writer)?;
let internal_docids = sdset::duo::Union::new(&internal_docids, new_ids).into_set_buf();
Ok(self.put_internal_docids(writer, &internal_docids)?)
}
pub fn remove_internal_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &sdset::Set<DocumentId>) -> MResult<()> {
use sdset::SetOperation;
// We do a difference of the old and new internal ids.
let internal_docids = self.internal_docids(writer)?;
let internal_docids = sdset::duo::Difference::new(&internal_docids, ids).into_set_buf();
Ok(self.put_internal_docids(writer, &internal_docids)?)
}
pub fn put_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> MResult<()>
where A: AsRef<[u8]>,
{
Ok(self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, ids.as_fst().as_bytes())?)
}
pub fn merge_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map<A>) -> MResult<()>
where A: AsRef<[u8]>,
{
use fst::{Streamer, IntoStreamer};
// Do an union of the old and the new set of external docids.
let external_docids = self.external_docids(writer)?;
let mut op = external_docids.op().add(new_docids.into_stream()).r#union();
let mut build = fst::MapBuilder::memory();
while let Some((docid, values)) = op.next() {
build.insert(docid, values[0].value).unwrap();
}
drop(op);
let external_docids = build.into_map();
Ok(self.put_external_docids(writer, &external_docids)?)
}
pub fn remove_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> MResult<()>
where A: AsRef<[u8]>,
{
use fst::{Streamer, IntoStreamer};
// Do an union of the old and the new set of external docids.
let external_docids = self.external_docids(writer)?;
let mut op = external_docids.op().add(ids.into_stream()).difference();
let mut build = fst::MapBuilder::memory();
while let Some((docid, values)) = op.next() {
build.insert(docid, values[0].value).unwrap();
}
drop(op);
let external_docids = build.into_map();
self.put_external_docids(writer, &external_docids)
}
pub fn external_docids<'a>(self, reader: &'a heed::RoTxn<'a, MainT>) -> MResult<FstMapCow> {
match self.main.get::<_, Str, ByteSlice>(reader, EXTERNAL_DOCIDS_KEY)? {
Some(bytes) => Ok(fst::Map::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
None => Ok(fst::Map::default().map_data(Cow::Owned).unwrap()),
}
}
pub fn external_to_internal_docid(self, reader: &heed::RoTxn<MainT>, external_docid: &str) -> MResult<Option<DocumentId>> {
let external_ids = self.external_docids(reader)?;
Ok(external_ids.get(external_docid).map(|id| DocumentId(id as u32)))
}
pub fn words_fst<'a>(self, reader: &'a heed::RoTxn<'a, MainT>) -> MResult<FstSetCow> {
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
}
}
pub fn put_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> MResult<()> {
Ok(self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, fst.as_fst().as_bytes())?)
}
pub fn put_sorted_document_ids_cache(self, writer: &mut heed::RwTxn<MainT>, documents_ids: &[DocumentId]) -> MResult<()> {
Ok(self.main.put::<_, Str, CowSlice<DocumentId>>(writer, SORTED_DOCUMENT_IDS_CACHE_KEY, documents_ids)?)
}
pub fn sorted_document_ids_cache<'a>(self, reader: &'a heed::RoTxn<'a, MainT>) -> MResult<Option<Cow<[DocumentId]>>> {
Ok(self.main.get::<_, Str, CowSlice<DocumentId>>(reader, SORTED_DOCUMENT_IDS_CACHE_KEY)?)
}
pub fn put_schema(self, writer: &mut heed::RwTxn<MainT>, schema: &Schema) -> MResult<()> {
Ok(self.main.put::<_, Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)?)
}
pub fn schema(self, reader: &heed::RoTxn<MainT>) -> MResult<Option<Schema>> {
Ok(self.main.get::<_, Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)?)
}
pub fn delete_schema(self, writer: &mut heed::RwTxn<MainT>) -> MResult<bool> {
Ok(self.main.delete::<_, Str>(writer, SCHEMA_KEY)?)
}
pub fn put_ranked_map(self, writer: &mut heed::RwTxn<MainT>, ranked_map: &RankedMap) -> MResult<()> {
Ok(self.main.put::<_, Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)?)
}
pub fn ranked_map(self, reader: &heed::RoTxn<MainT>) -> MResult<Option<RankedMap>> {
Ok(self.main.get::<_, Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)?)
}
pub fn put_synonyms_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> MResult<()> {
let bytes = fst.as_fst().as_bytes();
Ok(self.main.put::<_, Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)?)
}
pub(crate) fn synonyms_fst<'a>(self, reader: &'a heed::RoTxn<'a, MainT>) -> MResult<FstSetCow> {
match self.main.get::<_, Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
}
}
pub fn synonyms(self, reader: &heed::RoTxn<MainT>) -> MResult<Vec<String>> {
let synonyms = self
.synonyms_fst(&reader)?
.stream()
.into_strs()?;
Ok(synonyms)
}
pub fn put_stop_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> MResult<()> {
let bytes = fst.as_fst().as_bytes();
Ok(self.main.put::<_, Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)?)
}
pub(crate) fn stop_words_fst<'a>(self, reader: &'a heed::RoTxn<'a, MainT>) -> MResult<FstSetCow> {
match self.main.get::<_, Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
}
}
pub fn stop_words(self, reader: &heed::RoTxn<MainT>) -> MResult<Vec<String>> {
let stop_word_list = self
.stop_words_fst(reader)?
.stream()
.into_strs()?;
Ok(stop_word_list)
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn<MainT>, f: F) -> MResult<u64>
where
F: Fn(u64) -> u64,
{
let new = self.number_of_documents(&*writer).map(f)?;
self.main
.put::<_, Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
Ok(new)
}
pub fn number_of_documents(self, reader: &heed::RoTxn<MainT>) -> MResult<u64> {
match self
.main
.get::<_, Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? {
Some(value) => Ok(value),
None => Ok(0),
}
}
pub fn put_fields_distribution(
self,
writer: &mut heed::RwTxn<MainT>,
fields_frequency: &FreqsMap,
) -> MResult<()> {
Ok(self.main.put::<_, Str, SerdeFreqsMap>(writer, FIELDS_DISTRIBUTION_KEY, fields_frequency)?)
}
pub fn fields_distribution(&self, reader: &heed::RoTxn<MainT>) -> MResult<Option<FreqsMap>> {
match self
.main
.get::<_, Str, SerdeFreqsMap>(reader, FIELDS_DISTRIBUTION_KEY)?
{
Some(freqs) => Ok(Some(freqs)),
None => Ok(None),
}
}
pub fn attributes_for_faceting<'txn>(&self, reader: &'txn heed::RoTxn<MainT>) -> MResult<Option<Cow<'txn, Set<FieldId>>>> {
Ok(self.main.get::<_, Str, CowSet<FieldId>>(reader, ATTRIBUTES_FOR_FACETING_KEY)?)
}
pub fn put_attributes_for_faceting(self, writer: &mut heed::RwTxn<MainT>, attributes: &Set<FieldId>) -> MResult<()> {
Ok(self.main.put::<_, Str, CowSet<FieldId>>(writer, ATTRIBUTES_FOR_FACETING_KEY, attributes)?)
}
pub fn delete_attributes_for_faceting(self, writer: &mut heed::RwTxn<MainT>) -> MResult<bool> {
Ok(self.main.delete::<_, Str>(writer, ATTRIBUTES_FOR_FACETING_KEY)?)
}
pub fn ranking_rules(&self, reader: &heed::RoTxn<MainT>) -> MResult<Option<Vec<RankingRule>>> {
Ok(self.main.get::<_, Str, SerdeBincode<Vec<RankingRule>>>(reader, RANKING_RULES_KEY)?)
}
pub fn put_ranking_rules(self, writer: &mut heed::RwTxn<MainT>, value: &[RankingRule]) -> MResult<()> {
Ok(self.main.put::<_, Str, SerdeBincode<Vec<RankingRule>>>(writer, RANKING_RULES_KEY, &value.to_vec())?)
}
pub fn delete_ranking_rules(self, writer: &mut heed::RwTxn<MainT>) -> MResult<bool> {
Ok(self.main.delete::<_, Str>(writer, RANKING_RULES_KEY)?)
}
pub fn distinct_attribute(&self, reader: &heed::RoTxn<MainT>) -> MResult<Option<FieldId>> {
match self.main.get::<_, Str, OwnedType<u16>>(reader, DISTINCT_ATTRIBUTE_KEY)? {
Some(value) => Ok(Some(FieldId(value.to_owned()))),
None => Ok(None),
}
}
pub fn put_distinct_attribute(self, writer: &mut heed::RwTxn<MainT>, value: FieldId) -> MResult<()> {
Ok(self.main.put::<_, Str, OwnedType<u16>>(writer, DISTINCT_ATTRIBUTE_KEY, &value.0)?)
}
pub fn delete_distinct_attribute(self, writer: &mut heed::RwTxn<MainT>) -> MResult<bool> {
Ok(self.main.delete::<_, Str>(writer, DISTINCT_ATTRIBUTE_KEY)?)
}
pub fn put_customs(self, writer: &mut heed::RwTxn<MainT>, customs: &[u8]) -> MResult<()> {
Ok(self.main.put::<_, Str, ByteSlice>(writer, CUSTOMS_KEY, customs)?)
}
pub fn customs<'txn>(self, reader: &'txn heed::RoTxn<MainT>) -> MResult<Option<&'txn [u8]>> {
Ok(self.main.get::<_, Str, ByteSlice>(reader, CUSTOMS_KEY)?)
}
}

View File

@ -1,522 +0,0 @@
mod cow_set;
mod docs_words;
mod documents_ids;
mod documents_fields;
mod documents_fields_counts;
mod facets;
mod main;
mod postings_lists;
mod prefix_documents_cache;
mod prefix_postings_lists_cache;
mod synonyms;
mod updates;
mod updates_results;
pub use self::cow_set::CowSet;
pub use self::docs_words::DocsWords;
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
pub use self::documents_fields_counts::{DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter};
pub use self::documents_ids::{DocumentsIds, DiscoverIds};
pub use self::facets::Facets;
pub use self::main::Main;
pub use self::postings_lists::PostingsLists;
pub use self::prefix_documents_cache::PrefixDocumentsCache;
pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
pub use self::synonyms::Synonyms;
pub use self::updates::Updates;
pub use self::updates_results::UpdatesResults;
use std::borrow::Cow;
use std::collections::HashSet;
use std::convert::TryInto;
use std::{mem, ptr};
use heed::{BytesEncode, BytesDecode};
use meilisearch_schema::{IndexedPos, FieldId};
use sdset::{Set, SetBuf};
use serde::de::{self, Deserialize};
use zerocopy::{AsBytes, FromBytes};
use crate::criterion::Criteria;
use crate::database::{MainT, UpdateT};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::serde::Deserializer;
use crate::settings::SettingsUpdate;
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
type BEU32 = zerocopy::U32<byteorder::BigEndian>;
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
pub type BEU16 = zerocopy::U16<byteorder::BigEndian>;
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentFieldIndexedKey {
docid: BEU32,
indexed_pos: BEU16,
}
impl DocumentFieldIndexedKey {
fn new(docid: DocumentId, indexed_pos: IndexedPos) -> DocumentFieldIndexedKey {
DocumentFieldIndexedKey {
docid: BEU32::new(docid.0),
indexed_pos: BEU16::new(indexed_pos.0),
}
}
}
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentFieldStoredKey {
docid: BEU32,
field_id: BEU16,
}
impl DocumentFieldStoredKey {
fn new(docid: DocumentId, field_id: FieldId) -> DocumentFieldStoredKey {
DocumentFieldStoredKey {
docid: BEU32::new(docid.0),
field_id: BEU16::new(field_id.0),
}
}
}
#[derive(Default, Debug)]
pub struct Postings<'a> {
pub docids: Cow<'a, Set<DocumentId>>,
pub matches: Cow<'a, Set<DocIndex>>,
}
pub struct PostingsCodec;
impl<'a> BytesEncode<'a> for PostingsCodec {
type EItem = Postings<'a>;
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let u64_size = mem::size_of::<u64>();
let docids_size = item.docids.len() * mem::size_of::<DocumentId>();
let matches_size = item.matches.len() * mem::size_of::<DocIndex>();
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
let docids_len = item.docids.len() as u64;
buffer.extend_from_slice(&docids_len.to_be_bytes());
buffer.extend_from_slice(item.docids.as_bytes());
buffer.extend_from_slice(item.matches.as_bytes());
Some(Cow::Owned(buffer))
}
}
fn aligned_to(bytes: &[u8], align: usize) -> bool {
(bytes as *const _ as *const () as usize) % align == 0
}
fn from_bytes_to_set<'a, T: 'a>(bytes: &'a [u8]) -> Option<Cow<'a, Set<T>>>
where T: Clone + FromBytes
{
match zerocopy::LayoutVerified::<_, [T]>::new_slice(bytes) {
Some(layout) => Some(Cow::Borrowed(Set::new_unchecked(layout.into_slice()))),
None => {
let len = bytes.len();
let elem_size = mem::size_of::<T>();
// ensure that it is the alignment that is wrong
// and the length is valid
if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<T>()) {
let elems = len / elem_size;
let mut vec = Vec::<T>::with_capacity(elems);
unsafe {
let dst = vec.as_mut_ptr() as *mut u8;
ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
vec.set_len(elems);
}
return Some(Cow::Owned(SetBuf::new_unchecked(vec)));
}
None
}
}
}
impl<'a> BytesDecode<'a> for PostingsCodec {
type DItem = Postings<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let u64_size = mem::size_of::<u64>();
let docid_size = mem::size_of::<DocumentId>();
let (len_bytes, bytes) = bytes.split_at(u64_size);
let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize;
let docids_size = docids_len * docid_size;
let docids_bytes = &bytes[..docids_size];
let matches_bytes = &bytes[docids_size..];
let docids = from_bytes_to_set(docids_bytes)?;
let matches = from_bytes_to_set(matches_bytes)?;
Some(Postings { docids, matches })
}
}
fn main_name(name: &str) -> String {
format!("store-{}", name)
}
fn postings_lists_name(name: &str) -> String {
format!("store-{}-postings-lists", name)
}
fn documents_fields_name(name: &str) -> String {
format!("store-{}-documents-fields", name)
}
fn documents_fields_counts_name(name: &str) -> String {
format!("store-{}-documents-fields-counts", name)
}
fn synonyms_name(name: &str) -> String {
format!("store-{}-synonyms", name)
}
fn docs_words_name(name: &str) -> String {
format!("store-{}-docs-words", name)
}
fn prefix_documents_cache_name(name: &str) -> String {
format!("store-{}-prefix-documents-cache", name)
}
fn prefix_postings_lists_cache_name(name: &str) -> String {
format!("store-{}-prefix-postings-lists-cache", name)
}
fn updates_name(name: &str) -> String {
format!("store-{}-updates", name)
}
fn updates_results_name(name: &str) -> String {
format!("store-{}-updates-results", name)
}
fn facets_name(name: &str) -> String {
format!("store-{}-facets", name)
}
#[derive(Clone)]
pub struct Index {
pub main: Main,
pub postings_lists: PostingsLists,
pub documents_fields: DocumentsFields,
pub documents_fields_counts: DocumentsFieldsCounts,
pub facets: Facets,
pub synonyms: Synonyms,
pub docs_words: DocsWords,
pub prefix_documents_cache: PrefixDocumentsCache,
pub prefix_postings_lists_cache: PrefixPostingsListsCache,
pub updates: Updates,
pub updates_results: UpdatesResults,
pub(crate) updates_notifier: UpdateEventsEmitter,
}
impl Index {
pub fn document<T: de::DeserializeOwned>(
&self,
reader: &heed::RoTxn<MainT>,
attributes: Option<&HashSet<&str>>,
document_id: DocumentId,
) -> MResult<Option<T>> {
let schema = self.main.schema(reader)?;
let schema = schema.ok_or(Error::SchemaMissing)?;
let attributes = match attributes {
Some(attributes) => Some(attributes.iter().filter_map(|name| schema.id(*name)).collect()),
None => None,
};
let mut deserializer = Deserializer {
document_id,
reader,
documents_fields: self.documents_fields,
schema: &schema,
fields: attributes.as_ref(),
};
Ok(Option::<T>::deserialize(&mut deserializer)?)
}
pub fn document_attribute<T: de::DeserializeOwned>(
&self,
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: FieldId,
) -> MResult<Option<T>> {
let bytes = self
.documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes {
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
None => Ok(None),
}
}
pub fn document_attribute_bytes<'txn>(
&self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: FieldId,
) -> MResult<Option<&'txn [u8]>> {
let bytes = self
.documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes {
Some(bytes) => Ok(Some(bytes)),
None => Ok(None),
}
}
pub fn customs_update(&self, writer: &mut heed::RwTxn<UpdateT>, customs: Vec<u8>) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
Ok(update::push_customs_update(writer, self.updates, self.updates_results, customs)?)
}
pub fn settings_update(&self, writer: &mut heed::RwTxn<UpdateT>, update: SettingsUpdate) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
Ok(update::push_settings_update(writer, self.updates, self.updates_results, update)?)
}
pub fn documents_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_partial_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new_partial(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_deletion(&self) -> update::DocumentsDeletion {
update::DocumentsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn clear_all(&self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_clear_all(writer, self.updates, self.updates_results)
}
pub fn current_update_id(&self, reader: &heed::RoTxn<UpdateT>) -> MResult<Option<u64>> {
match self.updates.last_update(reader)? {
Some((id, _)) => Ok(Some(id)),
None => Ok(None),
}
}
pub fn update_status(
&self,
reader: &heed::RoTxn<UpdateT>,
update_id: u64,
) -> MResult<Option<update::UpdateStatus>> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
pub fn all_updates_status(&self, reader: &heed::RoTxn<UpdateT>) -> MResult<Vec<update::UpdateStatus>> {
let mut updates = Vec::new();
let mut last_update_result_id = 0;
// retrieve all updates results
if let Some((last_id, _)) = self.updates_results.last_update(reader)? {
updates.reserve(last_id as usize);
for id in 0..=last_id {
if let Some(update) = self.update_status(reader, id)? {
updates.push(update);
last_update_result_id = id + 1;
}
}
}
// retrieve all enqueued updates
if let Some((last_id, _)) = self.updates.last_update(reader)? {
for id in last_update_result_id..=last_id {
if let Some(update) = self.update_status(reader, id)? {
updates.push(update);
}
}
}
Ok(updates)
}
pub fn query_builder(&self) -> QueryBuilder {
QueryBuilder::new(self)
}
pub fn query_builder_with_criteria<'c, 'f, 'd, 'i>(
&'i self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, 'f, 'd, 'i> {
QueryBuilder::with_criteria(self, criteria)
}
}
pub fn create(
env: &heed::Env,
update_env: &heed::Env,
name: &str,
updates_notifier: UpdateEventsEmitter,
) -> MResult<Index> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
let documents_fields_name = documents_fields_name(name);
let documents_fields_counts_name = documents_fields_counts_name(name);
let synonyms_name = synonyms_name(name);
let docs_words_name = docs_words_name(name);
let prefix_documents_cache_name = prefix_documents_cache_name(name);
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
let updates_name = updates_name(name);
let updates_results_name = updates_results_name(name);
let facets_name = facets_name(name);
// open all the stores
let main = env.create_poly_database(Some(&main_name))?;
let postings_lists = env.create_database(Some(&postings_lists_name))?;
let documents_fields = env.create_database(Some(&documents_fields_name))?;
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
let facets = env.create_database(Some(&facets_name))?;
let synonyms = env.create_database(Some(&synonyms_name))?;
let docs_words = env.create_database(Some(&docs_words_name))?;
let prefix_documents_cache = env.create_database(Some(&prefix_documents_cache_name))?;
let prefix_postings_lists_cache = env.create_database(Some(&prefix_postings_lists_cache_name))?;
let updates = update_env.create_database(Some(&updates_name))?;
let updates_results = update_env.create_database(Some(&updates_results_name))?;
Ok(Index {
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
facets: Facets { facets },
updates: Updates { updates },
updates_results: UpdatesResults { updates_results },
updates_notifier,
})
}
pub fn open(
env: &heed::Env,
update_env: &heed::Env,
name: &str,
updates_notifier: UpdateEventsEmitter,
) -> MResult<Option<Index>> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
let documents_fields_name = documents_fields_name(name);
let documents_fields_counts_name = documents_fields_counts_name(name);
let synonyms_name = synonyms_name(name);
let docs_words_name = docs_words_name(name);
let prefix_documents_cache_name = prefix_documents_cache_name(name);
let facets_name = facets_name(name);
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
let updates_name = updates_name(name);
let updates_results_name = updates_results_name(name);
// open all the stores
let main = match env.open_poly_database(Some(&main_name))? {
Some(main) => main,
None => return Ok(None),
};
let postings_lists = match env.open_database(Some(&postings_lists_name))? {
Some(postings_lists) => postings_lists,
None => return Ok(None),
};
let documents_fields = match env.open_database(Some(&documents_fields_name))? {
Some(documents_fields) => documents_fields,
None => return Ok(None),
};
let documents_fields_counts = match env.open_database(Some(&documents_fields_counts_name))? {
Some(documents_fields_counts) => documents_fields_counts,
None => return Ok(None),
};
let synonyms = match env.open_database(Some(&synonyms_name))? {
Some(synonyms) => synonyms,
None => return Ok(None),
};
let docs_words = match env.open_database(Some(&docs_words_name))? {
Some(docs_words) => docs_words,
None => return Ok(None),
};
let prefix_documents_cache = match env.open_database(Some(&prefix_documents_cache_name))? {
Some(prefix_documents_cache) => prefix_documents_cache,
None => return Ok(None),
};
let facets = match env.open_database(Some(&facets_name))? {
Some(facets) => facets,
None => return Ok(None),
};
let prefix_postings_lists_cache = match env.open_database(Some(&prefix_postings_lists_cache_name))? {
Some(prefix_postings_lists_cache) => prefix_postings_lists_cache,
None => return Ok(None),
};
let updates = match update_env.open_database(Some(&updates_name))? {
Some(updates) => updates,
None => return Ok(None),
};
let updates_results = match update_env.open_database(Some(&updates_results_name))? {
Some(updates_results) => updates_results,
None => return Ok(None),
};
Ok(Some(Index {
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
facets: Facets { facets },
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
updates: Updates { updates },
updates_results: UpdatesResults { updates_results },
updates_notifier,
}))
}
pub fn clear(
writer: &mut heed::RwTxn<MainT>,
update_writer: &mut heed::RwTxn<UpdateT>,
index: &Index,
) -> MResult<()> {
// clear all the stores
index.main.clear(writer)?;
index.postings_lists.clear(writer)?;
index.documents_fields.clear(writer)?;
index.documents_fields_counts.clear(writer)?;
index.synonyms.clear(writer)?;
index.docs_words.clear(writer)?;
index.prefix_documents_cache.clear(writer)?;
index.prefix_postings_lists_cache.clear(writer)?;
index.updates.clear(update_writer)?;
index.updates_results.clear(update_writer)?;
Ok(())
}

View File

@ -1,47 +0,0 @@
use std::borrow::Cow;
use heed::Result as ZResult;
use heed::types::ByteSlice;
use sdset::{Set, SetBuf};
use slice_group_by::GroupBy;
use crate::database::MainT;
use crate::DocIndex;
use crate::store::{Postings, PostingsCodec};
#[derive(Copy, Clone)]
pub struct PostingsLists {
pub(crate) postings_lists: heed::Database<ByteSlice, PostingsCodec>,
}
impl PostingsLists {
pub fn put_postings_list(
self,
writer: &mut heed::RwTxn<MainT>,
word: &[u8],
matches: &Set<DocIndex>,
) -> ZResult<()> {
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
let matches = Cow::Borrowed(matches);
let postings = Postings { docids, matches };
self.postings_lists.put(writer, word, &postings)
}
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.postings_lists.clear(writer)
}
pub fn postings_list<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
word: &[u8],
) -> ZResult<Option<Postings<'txn>>> {
self.postings_lists.get(reader, word)
}
}

View File

@ -1,80 +0,0 @@
use std::borrow::Cow;
use heed::types::{OwnedType, CowSlice};
use heed::Result as ZResult;
use zerocopy::{AsBytes, FromBytes};
use super::{BEU64, BEU32};
use crate::{DocumentId, Highlight};
use crate::database::MainT;
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct PrefixKey {
prefix: [u8; 4],
index: BEU64,
docid: BEU32,
}
impl PrefixKey {
pub fn new(prefix: [u8; 4], index: u64, docid: u32) -> PrefixKey {
PrefixKey {
prefix,
index: BEU64::new(index),
docid: BEU32::new(docid),
}
}
}
#[derive(Copy, Clone)]
pub struct PrefixDocumentsCache {
pub(crate) prefix_documents_cache: heed::Database<OwnedType<PrefixKey>, CowSlice<Highlight>>,
}
impl PrefixDocumentsCache {
pub fn put_prefix_document(
self,
writer: &mut heed::RwTxn<MainT>,
prefix: [u8; 4],
index: usize,
docid: DocumentId,
highlights: &[Highlight],
) -> ZResult<()> {
let key = PrefixKey::new(prefix, index as u64, docid.0);
self.prefix_documents_cache.put(writer, &key, highlights)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.prefix_documents_cache.clear(writer)
}
pub fn prefix_documents<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
prefix: [u8; 4],
) -> ZResult<PrefixDocumentsIter<'txn>> {
let start = PrefixKey::new(prefix, 0, 0);
let end = PrefixKey::new(prefix, u64::max_value(), u32::max_value());
let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
Ok(PrefixDocumentsIter { iter })
}
}
pub struct PrefixDocumentsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<PrefixKey>, CowSlice<Highlight>>,
}
impl<'txn> Iterator for PrefixDocumentsIter<'txn> {
type Item = ZResult<(DocumentId, Cow<'txn, [Highlight]>)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, highlights))) => {
let docid = DocumentId(key.docid.get());
Some(Ok((docid, highlights)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}

View File

@ -1,45 +0,0 @@
use std::borrow::Cow;
use heed::Result as ZResult;
use heed::types::OwnedType;
use sdset::{Set, SetBuf};
use slice_group_by::GroupBy;
use crate::database::MainT;
use crate::DocIndex;
use crate::store::{PostingsCodec, Postings};
#[derive(Copy, Clone)]
pub struct PrefixPostingsListsCache {
pub(crate) prefix_postings_lists_cache: heed::Database<OwnedType<[u8; 4]>, PostingsCodec>,
}
impl PrefixPostingsListsCache {
pub fn put_prefix_postings_list(
self,
writer: &mut heed::RwTxn<MainT>,
prefix: [u8; 4],
matches: &Set<DocIndex>,
) -> ZResult<()>
{
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
let matches = Cow::Borrowed(matches);
let postings = Postings { docids, matches };
self.prefix_postings_lists_cache.put(writer, &prefix, &postings)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.prefix_postings_lists_cache.clear(writer)
}
pub fn prefix_postings_list<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
prefix: [u8; 4],
) -> ZResult<Option<Postings<'txn>>>
{
self.prefix_postings_lists_cache.get(reader, &prefix)
}
}

View File

@ -1,44 +0,0 @@
use std::borrow::Cow;
use heed::Result as ZResult;
use heed::types::ByteSlice;
use crate::database::MainT;
use crate::{FstSetCow, MResult};
#[derive(Copy, Clone)]
pub struct Synonyms {
pub(crate) synonyms: heed::Database<ByteSlice, ByteSlice>,
}
impl Synonyms {
pub fn put_synonyms<A>(self, writer: &mut heed::RwTxn<MainT>, word: &[u8], synonyms: &fst::Set<A>) -> ZResult<()>
where A: AsRef<[u8]>,
{
let bytes = synonyms.as_fst().as_bytes();
self.synonyms.put(writer, word, bytes)
}
pub fn del_synonyms(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
self.synonyms.delete(writer, word)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.synonyms.clear(writer)
}
pub(crate) fn synonyms_fst<'txn>(self, reader: &'txn heed::RoTxn<MainT>, word: &[u8]) -> ZResult<FstSetCow<'txn>> {
match self.synonyms.get(reader, word)? {
Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
}
}
pub fn synonyms(self, reader: &heed::RoTxn<MainT>, word: &[u8]) -> MResult<Vec<String>> {
let synonyms = self
.synonyms_fst(&reader, word)?
.stream()
.into_strs()?;
Ok(synonyms)
}
}

View File

@ -1,65 +0,0 @@
use super::BEU64;
use crate::database::UpdateT;
use crate::update::Update;
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct Updates {
pub(crate) updates: heed::Database<OwnedType<BEU64>, SerdeJson<Update>>,
}
impl Updates {
// TODO do not trigger deserialize if possible
pub fn last_update(self, reader: &heed::RoTxn<UpdateT>) -> ZResult<Option<(u64, Update)>> {
match self.updates.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
}
}
// TODO do not trigger deserialize if possible
pub fn first_update(self, reader: &heed::RoTxn<UpdateT>) -> ZResult<Option<(u64, Update)>> {
match self.updates.first(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
}
}
// TODO do not trigger deserialize if possible
pub fn get(self, reader: &heed::RoTxn<UpdateT>, update_id: u64) -> ZResult<Option<Update>> {
let update_id = BEU64::new(update_id);
self.updates.get(reader, &update_id)
}
pub fn put_update(
self,
writer: &mut heed::RwTxn<UpdateT>,
update_id: u64,
update: &Update,
) -> ZResult<()> {
// TODO prefer using serde_json?
let update_id = BEU64::new(update_id);
self.updates.put(writer, &update_id, update)
}
pub fn del_update(self, writer: &mut heed::RwTxn<UpdateT>, update_id: u64) -> ZResult<bool> {
let update_id = BEU64::new(update_id);
self.updates.delete(writer, &update_id)
}
pub fn pop_front(self, writer: &mut heed::RwTxn<UpdateT>) -> ZResult<Option<(u64, Update)>> {
match self.first_update(writer)? {
Some((update_id, update)) => {
let key = BEU64::new(update_id);
self.updates.delete(writer, &key)?;
Ok(Some((update_id, update)))
}
None => Ok(None),
}
}
pub fn clear(self, writer: &mut heed::RwTxn<UpdateT>) -> ZResult<()> {
self.updates.clear(writer)
}
}

View File

@ -1,45 +0,0 @@
use super::BEU64;
use crate::database::UpdateT;
use crate::update::ProcessedUpdateResult;
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct UpdatesResults {
pub(crate) updates_results: heed::Database<OwnedType<BEU64>, SerdeJson<ProcessedUpdateResult>>,
}
impl UpdatesResults {
pub fn last_update(
self,
reader: &heed::RoTxn<UpdateT>,
) -> ZResult<Option<(u64, ProcessedUpdateResult)>> {
match self.updates_results.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
}
}
pub fn put_update_result(
self,
writer: &mut heed::RwTxn<UpdateT>,
update_id: u64,
update_result: &ProcessedUpdateResult,
) -> ZResult<()> {
let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result)
}
pub fn update_result(
self,
reader: &heed::RoTxn<UpdateT>,
update_id: u64,
) -> ZResult<Option<ProcessedUpdateResult>> {
let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id)
}
pub fn clear(self, writer: &mut heed::RwTxn<UpdateT>) -> ZResult<()> {
self.updates_results.clear(writer)
}
}

View File

@ -1,36 +0,0 @@
use crate::database::{MainT, UpdateT};
use crate::update::{next_update_id, Update};
use crate::{store, MResult, RankedMap};
pub fn apply_clear_all(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
) -> MResult<()> {
index.main.put_words_fst(writer, &fst::Set::default())?;
index.main.put_external_docids(writer, &fst::Map::default())?;
index.main.put_internal_docids(writer, &sdset::SetBuf::default())?;
index.main.put_ranked_map(writer, &RankedMap::default())?;
index.main.put_number_of_documents(writer, |_| 0)?;
index.main.put_sorted_document_ids_cache(writer, &[])?;
index.documents_fields.clear(writer)?;
index.documents_fields_counts.clear(writer)?;
index.postings_lists.clear(writer)?;
index.docs_words.clear(writer)?;
index.prefix_documents_cache.clear(writer)?;
index.prefix_postings_lists_cache.clear(writer)?;
index.facets.clear(writer)?;
Ok(())
}
pub fn push_clear_all(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::clear_all();
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}

View File

@ -1,26 +0,0 @@
use crate::database::{MainT, UpdateT};
use crate::{store, MResult};
use crate::update::{next_update_id, Update};
pub fn apply_customs_update(
writer: &mut heed::RwTxn<MainT>,
main_store: store::Main,
customs: &[u8],
) -> MResult<()> {
main_store.put_customs(writer, customs)
}
pub fn push_customs_update(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
customs: Vec<u8>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::customs(customs);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}

View File

@ -1,444 +0,0 @@
use std::borrow::Cow;
use std::collections::{HashMap, BTreeMap};
use fst::{set::OpBuilder, SetBuilder};
use indexmap::IndexMap;
use meilisearch_schema::{Schema, FieldId};
use meilisearch_types::DocumentId;
use sdset::{duo::Union, SetOperation};
use serde::Deserialize;
use serde_json::Value;
use crate::database::{MainT, UpdateT};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::facets;
use crate::raw_indexer::RawIndexer;
use crate::serde::Deserializer;
use crate::store::{self, DocumentsFields, DocumentsFieldsCounts, DiscoverIds};
use crate::update::helpers::{index_value, value_to_number, extract_document_id};
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
use crate::{Error, MResult, RankedMap};
pub struct DocumentsAddition<D> {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
// Whether the user explicitly set the primary key in the update
primary_key: Option<String>,
documents: Vec<D>,
is_partial: bool,
}
impl<D> DocumentsAddition<D> {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: false,
primary_key: None,
}
}
pub fn new_partial(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: true,
primary_key: None,
}
}
pub fn set_primary_key(&mut self, primary_key: String) {
self.primary_key = Some(primary_key);
}
pub fn update_document(&mut self, document: D) {
self.documents.push(document);
}
pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64>
where
D: serde::Serialize,
{
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_documents_addition(
writer,
self.updates_store,
self.updates_results_store,
self.documents,
self.is_partial,
self.primary_key,
)?;
Ok(update_id)
}
}
impl<D> Extend<D> for DocumentsAddition<D> {
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
pub fn push_documents_addition<D: serde::Serialize>(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: Vec<D>,
is_partial: bool,
primary_key: Option<String>,
) -> MResult<u64> {
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = serde_json::to_vec(&add)?;
let add = serde_json::from_slice(&vec)?;
values.push(add);
}
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = if is_partial {
Update::documents_partial(primary_key, values)
} else {
Update::documents_addition(primary_key, values)
};
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
#[allow(clippy::too_many_arguments)]
fn index_document<A: AsRef<[u8]>>(
writer: &mut heed::RwTxn<MainT>,
documents_fields: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
ranked_map: &mut RankedMap,
indexer: &mut RawIndexer<A>,
schema: &Schema,
field_id: FieldId,
document_id: DocumentId,
value: &Value,
) -> MResult<()>
{
let serialized = serde_json::to_vec(value)?;
documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
if let Some(indexed_pos) = schema.is_searchable(field_id) {
let number_of_words = index_value(indexer, document_id, indexed_pos, value);
if let Some(number_of_words) = number_of_words {
documents_fields_counts.put_document_field_count(
writer,
document_id,
indexed_pos,
number_of_words as u16,
)?;
}
}
if schema.is_ranked(field_id) {
let number = value_to_number(value).unwrap_or_default();
ranked_map.insert(document_id, field_id, number);
}
Ok(())
}
pub fn apply_addition(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
new_documents: Vec<IndexMap<String, Value>>,
partial: bool,
primary_key: Option<String>,
) -> MResult<()>
{
let mut schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
// Retrieve the documents ids related structures
let external_docids = index.main.external_docids(writer)?;
let internal_docids = index.main.internal_docids(writer)?;
let mut available_ids = DiscoverIds::new(&internal_docids);
let primary_key = match schema.primary_key() {
Some(primary_key) => primary_key.to_string(),
None => {
let name = primary_key.ok_or(Error::MissingPrimaryKey)?;
schema.set_primary_key(&name)?;
name
}
};
// 1. store documents ids for future deletion
let mut documents_additions = HashMap::new();
let mut new_external_docids = BTreeMap::new();
let mut new_internal_docids = Vec::with_capacity(new_documents.len());
for mut document in new_documents {
let external_docids_get = |docid: &str| {
match (external_docids.get(docid), new_external_docids.get(docid)) {
(_, Some(&id))
| (Some(id), _) => Some(id as u32),
(None, None) => None,
}
};
let (internal_docid, external_docid) =
extract_document_id(
&primary_key,
&document,
&external_docids_get,
&mut available_ids,
)?;
new_external_docids.insert(external_docid, internal_docid.0 as u64);
new_internal_docids.push(internal_docid);
if partial {
let mut deserializer = Deserializer {
document_id: internal_docid,
reader: writer,
documents_fields: index.documents_fields,
schema: &schema,
fields: None,
};
let old_document = Option::<HashMap<String, Value>>::deserialize(&mut deserializer)?;
if let Some(old_document) = old_document {
for (key, value) in old_document {
document.entry(key).or_insert(value);
}
}
}
documents_additions.insert(internal_docid, document);
}
// 2. remove the documents postings lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = new_external_docids.iter().map(|(id, _)| id.clone()).collect();
apply_documents_deletion(writer, index, documents_ids)?;
let mut ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
let mut indexer = RawIndexer::new(&stop_words);
// For each document in this update
for (document_id, document) in &documents_additions {
// For each key-value pair in the document.
for (attribute, value) in document {
let (field_id, _) = schema.insert_with_position(&attribute)?;
index_document(
writer,
index.documents_fields,
index.documents_fields_counts,
&mut ranked_map,
&mut indexer,
&schema,
field_id,
*document_id,
&value,
)?;
}
}
write_documents_addition_index(
writer,
index,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
index.main.put_schema(writer, &schema)?;
let new_external_docids = fst::Map::from_iter(new_external_docids.iter().map(|(ext, id)| (ext, *id as u64)))?;
let new_internal_docids = sdset::SetBuf::from_dirty(new_internal_docids);
index.main.merge_external_docids(writer, &new_external_docids)?;
index.main.merge_internal_docids(writer, &new_internal_docids)?;
// recompute all facet attributes after document update.
if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
let docids = index.main.internal_docids(writer)?;
let facet_map = facets::facet_map_from_docids(writer, index, &docids, attributes_for_facetting.as_ref())?;
index.facets.add(writer, facet_map)?;
}
// update is finished; update sorted document id cache with new state
let mut document_ids = index.main.internal_docids(writer)?.to_vec();
super::cache_document_ids_sorted(writer, &ranked_map, index, &mut document_ids)?;
Ok(())
}
pub fn apply_documents_partial_addition(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
new_documents: Vec<IndexMap<String, Value>>,
primary_key: Option<String>,
) -> MResult<()> {
apply_addition(writer, index, new_documents, true, primary_key)
}
pub fn apply_documents_addition(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
new_documents: Vec<IndexMap<String, Value>>,
primary_key: Option<String>,
) -> MResult<()> {
apply_addition(writer, index, new_documents, false, primary_key)
}
pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
let schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = RankedMap::default();
// 1. retrieve all documents ids
let mut documents_ids_to_reindex = Vec::new();
for result in index.documents_fields_counts.documents_ids(writer)? {
let document_id = result?;
documents_ids_to_reindex.push(document_id);
}
// 2. remove the documents posting lists
index.main.put_words_fst(writer, &fst::Set::default())?;
index.main.put_ranked_map(writer, &ranked_map)?;
index.main.put_number_of_documents(writer, |_| 0)?;
index.facets.clear(writer)?;
index.postings_lists.clear(writer)?;
index.docs_words.clear(writer)?;
let stop_words = index.main
.stop_words_fst(writer)?
.map_data(Cow::into_owned)
.unwrap();
let number_of_inserted_documents = documents_ids_to_reindex.len();
let mut indexer = RawIndexer::new(&stop_words);
let mut ram_store = HashMap::new();
if let Some(ref attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
let facet_map = facets::facet_map_from_docids(writer, &index, &documents_ids_to_reindex, &attributes_for_facetting)?;
index.facets.add(writer, facet_map)?;
}
// ^-- https://github.com/meilisearch/MeiliSearch/pull/631#issuecomment-626624470 --v
for document_id in &documents_ids_to_reindex {
for result in index.documents_fields.document_fields(writer, *document_id)? {
let (field_id, bytes) = result?;
let value: Value = serde_json::from_slice(bytes)?;
ram_store.insert((document_id, field_id), value);
}
// For each key-value pair in the document.
for ((document_id, field_id), value) in ram_store.drain() {
index_document(
writer,
index.documents_fields,
index.documents_fields_counts,
&mut ranked_map,
&mut indexer,
&schema,
field_id,
*document_id,
&value,
)?;
}
}
// 4. write the new index in the main store
write_documents_addition_index(
writer,
index,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
index.main.put_schema(writer, &schema)?;
// recompute all facet attributes after document update.
if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
let docids = index.main.internal_docids(writer)?;
let facet_map = facets::facet_map_from_docids(writer, index, &docids, attributes_for_facetting.as_ref())?;
index.facets.add(writer, facet_map)?;
}
// update is finished; update sorted document id cache with new state
let mut document_ids = index.main.internal_docids(writer)?.to_vec();
super::cache_document_ids_sorted(writer, &ranked_map, index, &mut document_ids)?;
Ok(())
}
pub fn write_documents_addition_index<A: AsRef<[u8]>>(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer<A>,
) -> MResult<()>
{
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
for (word, delta_set) in indexed.words_doc_indexes {
delta_words_builder.insert(&word).unwrap();
let set = match index.postings_lists.postings_list(writer, &word)? {
Some(postings) => Union::new(&postings.matches, &delta_set).into_set_buf(),
None => delta_set,
};
index.postings_lists.put_postings_list(writer, &word, &set)?;
}
for (id, words) in indexed.docs_words {
index.docs_words.put_doc_words(writer, id, &words)?;
}
let delta_words = delta_words_builder.into_set();
let words_fst = index.main.words_fst(writer)?;
let words = if !words_fst.is_empty() {
let op = OpBuilder::new()
.add(words_fst.stream())
.add(delta_words.stream())
.r#union();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder.into_set()
} else {
delta_words
};
index.main.put_words_fst(writer, &words)?;
index.main.put_ranked_map(writer, ranked_map)?;
index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
compute_short_prefixes(writer, &words, index)?;
Ok(())
}

View File

@ -1,207 +0,0 @@
use std::collections::{BTreeSet, HashMap, HashSet};
use fst::{SetBuilder, Streamer};
use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
use crate::database::{MainT, UpdateT};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::facets;
use crate::store;
use crate::update::{next_update_id, compute_short_prefixes, Update};
use crate::{DocumentId, Error, MResult, RankedMap, MainWriter, Index};
pub struct DocumentsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
external_docids: Vec<String>,
}
impl DocumentsDeletion {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> DocumentsDeletion {
DocumentsDeletion {
updates_store,
updates_results_store,
updates_notifier,
external_docids: Vec::new(),
}
}
pub fn delete_document_by_external_docid(&mut self, document_id: String) {
self.external_docids.push(document_id);
}
pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_documents_deletion(
writer,
self.updates_store,
self.updates_results_store,
self.external_docids,
)?;
Ok(update_id)
}
}
impl Extend<String> for DocumentsDeletion {
fn extend<T: IntoIterator<Item=String>>(&mut self, iter: T) {
self.external_docids.extend(iter)
}
}
pub fn push_documents_deletion(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
external_docids: Vec<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::documents_deletion(external_docids);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_documents_deletion(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
external_docids: Vec<String>,
) -> MResult<()>
{
let (external_docids, internal_docids) = {
let new_external_docids = SetBuf::from_dirty(external_docids);
let mut internal_docids = Vec::new();
let old_external_docids = index.main.external_docids(writer)?;
for external_docid in new_external_docids.as_slice() {
if let Some(id) = old_external_docids.get(external_docid) {
internal_docids.push(DocumentId(id as u32));
}
}
let new_external_docids = fst::Map::from_iter(new_external_docids.into_iter().map(|k| (k, 0))).unwrap();
(new_external_docids, SetBuf::from_dirty(internal_docids))
};
let schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
// facet filters deletion
if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
let facet_map = facets::facet_map_from_docids(writer, &index, &internal_docids, &attributes_for_facetting)?;
index.facets.remove(writer, facet_map)?;
}
// collect the ranked attributes according to the schema
let ranked_fields = schema.ranked();
let mut words_document_ids = HashMap::new();
for id in internal_docids.iter().cloned() {
// remove all the ranked attributes from the ranked_map
for ranked_attr in ranked_fields {
ranked_map.remove(id, *ranked_attr);
}
let words = index.docs_words.doc_words(writer, id)?;
if !words.is_empty() {
let mut stream = words.stream();
while let Some(word) = stream.next() {
let word = word.to_vec();
words_document_ids
.entry(word)
.or_insert_with(Vec::new)
.push(id);
}
}
}
let mut deleted_documents = HashSet::new();
let mut removed_words = BTreeSet::new();
for (word, document_ids) in words_document_ids {
let document_ids = SetBuf::from_dirty(document_ids);
if let Some(postings) = index.postings_lists.postings_list(writer, &word)? {
let op = DifferenceByKey::new(&postings.matches, &document_ids, |d| d.document_id, |id| *id);
let doc_indexes = op.into_set_buf();
if !doc_indexes.is_empty() {
index.postings_lists.put_postings_list(writer, &word, &doc_indexes)?;
} else {
index.postings_lists.del_postings_list(writer, &word)?;
removed_words.insert(word);
}
}
for id in document_ids {
index.documents_fields_counts.del_all_document_fields_counts(writer, id)?;
if index.documents_fields.del_all_document_fields(writer, id)? != 0 {
deleted_documents.insert(id);
}
}
}
let deleted_documents_len = deleted_documents.len() as u64;
for id in &deleted_documents {
index.docs_words.del_doc_words(writer, *id)?;
}
let removed_words = fst::Set::from_iter(removed_words).unwrap();
let words = {
let words_set = index.main.words_fst(writer)?;
let op = fst::set::OpBuilder::new()
.add(words_set.stream())
.add(removed_words.stream())
.difference();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder.into_set()
};
index.main.put_words_fst(writer, &words)?;
index.main.put_ranked_map(writer, &ranked_map)?;
index.main.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
// We apply the changes to the user and internal ids
index.main.remove_external_docids(writer, &external_docids)?;
index.main.remove_internal_docids(writer, &internal_docids)?;
compute_short_prefixes(writer, &words, index)?;
// update is finished; update sorted document id cache with new state
document_cache_remove_deleted(writer, index, &ranked_map, &deleted_documents)?;
Ok(())
}
/// rebuilds the document id cache by either removing deleted documents from the existing cache,
/// and generating a new one from docs in store
fn document_cache_remove_deleted(writer: &mut MainWriter, index: &Index, ranked_map: &RankedMap, documents_to_delete: &HashSet<DocumentId>) -> MResult<()> {
let new_cache = match index.main.sorted_document_ids_cache(writer)? {
// only keep documents that are not in the list of deleted documents. Order is preserved,
// no need to resort
Some(old_cache) => {
old_cache.iter().filter(|docid| !documents_to_delete.contains(docid)).cloned().collect::<Vec<_>>()
}
// couldn't find cached documents, try building a new cache from documents in store
None => {
let mut document_ids = index.main.internal_docids(writer)?.to_vec();
super::cache_document_ids_sorted(writer, ranked_map, index, &mut document_ids)?;
document_ids
}
};
index.main.put_sorted_document_ids_cache(writer, &new_cache)?;
Ok(())
}

View File

@ -1,142 +0,0 @@
use std::fmt::Write as _;
use indexmap::IndexMap;
use meilisearch_schema::IndexedPos;
use meilisearch_types::DocumentId;
use ordered_float::OrderedFloat;
use serde_json::Value;
use crate::Number;
use crate::raw_indexer::RawIndexer;
use crate::serde::SerializerError;
use crate::store::DiscoverIds;
/// Returns the number of words indexed or `None` if the type is unindexable.
pub fn index_value<A: AsRef<[u8]>>(
indexer: &mut RawIndexer<A>,
document_id: DocumentId,
indexed_pos: IndexedPos,
value: &Value,
) -> Option<usize>
{
match value {
Value::Null => None,
Value::Bool(boolean) => {
let text = boolean.to_string();
let number_of_words = indexer.index_text(document_id, indexed_pos, &text);
Some(number_of_words)
},
Value::Number(number) => {
let text = number.to_string();
Some(indexer.index_text(document_id, indexed_pos, &text))
},
Value::String(string) => {
Some(indexer.index_text(document_id, indexed_pos, &string))
},
Value::Array(_) => {
let text = value_to_string(value);
Some(indexer.index_text(document_id, indexed_pos, &text))
},
Value::Object(_) => {
let text = value_to_string(value);
Some(indexer.index_text(document_id, indexed_pos, &text))
},
}
}
/// Transforms the JSON Value type into a String.
pub fn value_to_string(value: &Value) -> String {
fn internal_value_to_string(string: &mut String, value: &Value) {
match value {
Value::Null => (),
Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); },
Value::Number(number) => { let _ = write!(string, "{}", &number); },
Value::String(text) => string.push_str(&text),
Value::Array(array) => {
for value in array {
internal_value_to_string(string, value);
let _ = string.write_str(". ");
}
},
Value::Object(object) => {
for (key, value) in object {
string.push_str(key);
let _ = string.write_str(". ");
internal_value_to_string(string, value);
let _ = string.write_str(". ");
}
},
}
}
let mut string = String::new();
internal_value_to_string(&mut string, value);
string
}
/// Transforms the JSON Value type into a Number.
pub fn value_to_number(value: &Value) -> Option<Number> {
use std::str::FromStr;
match value {
Value::Null => None,
Value::Bool(boolean) => Some(Number::Unsigned(*boolean as u64)),
Value::Number(number) => {
match (number.as_i64(), number.as_u64(), number.as_f64()) {
(Some(n), _, _) => Some(Number::Signed(n)),
(_, Some(n), _) => Some(Number::Unsigned(n)),
(_, _, Some(n)) => Some(Number::Float(OrderedFloat(n))),
(None, None, None) => None,
}
},
Value::String(string) => Number::from_str(string).ok(),
Value::Array(_array) => None,
Value::Object(_object) => None,
}
}
/// Validates a string representation to be a correct document id and returns
/// the corresponding id or generate a new one, this is the way we produce documents ids.
pub fn discover_document_id<F>(
docid: &str,
external_docids_get: F,
available_docids: &mut DiscoverIds<'_>,
) -> Result<DocumentId, SerializerError>
where
F: FnOnce(&str) -> Option<u32>
{
if docid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
match external_docids_get(docid) {
Some(id) => Ok(DocumentId(id)),
None => {
let internal_id = available_docids.next().expect("no more ids available");
Ok(internal_id)
},
}
} else {
Err(SerializerError::InvalidDocumentIdFormat)
}
}
/// Extracts and validates the document id of a document.
pub fn extract_document_id<F>(
primary_key: &str,
document: &IndexMap<String, Value>,
external_docids_get: F,
available_docids: &mut DiscoverIds<'_>,
) -> Result<(DocumentId, String), SerializerError>
where
F: FnOnce(&str) -> Option<u32>
{
match document.get(primary_key) {
Some(value) => {
let docid = match value {
Value::Number(number) => number.to_string(),
Value::String(string) => string.clone(),
_ => return Err(SerializerError::InvalidDocumentIdFormat),
};
discover_document_id(&docid, external_docids_get, available_docids).map(|id| (id, docid))
}
None => Err(SerializerError::DocumentIdNotFound),
}
}

View File

@ -1,391 +0,0 @@
mod clear_all;
mod customs_update;
mod documents_addition;
mod documents_deletion;
mod settings_update;
mod helpers;
pub use self::clear_all::{apply_clear_all, push_clear_all};
pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::helpers::{index_value, value_to_string, value_to_number, discover_document_id, extract_document_id};
pub use self::settings_update::{apply_settings_update, push_settings_update};
use std::cmp;
use std::time::Instant;
use chrono::{DateTime, Utc};
use fst::{IntoStreamer, Streamer};
use heed::Result as ZResult;
use indexmap::IndexMap;
use log::debug;
use sdset::Set;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use meilisearch_error::ErrorCode;
use meilisearch_types::DocumentId;
use crate::{store, MResult, RankedMap};
use crate::database::{MainT, UpdateT};
use crate::settings::SettingsUpdate;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Update {
data: UpdateData,
enqueued_at: DateTime<Utc>,
}
impl Update {
fn clear_all() -> Update {
Update {
data: UpdateData::ClearAll,
enqueued_at: Utc::now(),
}
}
fn customs(data: Vec<u8>) -> Update {
Update {
data: UpdateData::Customs(data),
enqueued_at: Utc::now(),
}
}
fn documents_addition(primary_key: Option<String>, documents: Vec<IndexMap<String, Value>>) -> Update {
Update {
data: UpdateData::DocumentsAddition{ documents, primary_key },
enqueued_at: Utc::now(),
}
}
fn documents_partial(primary_key: Option<String>, documents: Vec<IndexMap<String, Value>>) -> Update {
Update {
data: UpdateData::DocumentsPartial{ documents, primary_key },
enqueued_at: Utc::now(),
}
}
fn documents_deletion(data: Vec<String>) -> Update {
Update {
data: UpdateData::DocumentsDeletion(data),
enqueued_at: Utc::now(),
}
}
fn settings(data: SettingsUpdate) -> Update {
Update {
data: UpdateData::Settings(Box::new(data)),
enqueued_at: Utc::now(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateData {
ClearAll,
Customs(Vec<u8>),
// (primary key, documents)
DocumentsAddition {
primary_key: Option<String>,
documents: Vec<IndexMap<String, Value>>
},
DocumentsPartial {
primary_key: Option<String>,
documents: Vec<IndexMap<String, Value>>,
},
DocumentsDeletion(Vec<String>),
Settings(Box<SettingsUpdate>)
}
impl UpdateData {
pub fn update_type(&self) -> UpdateType {
match self {
UpdateData::ClearAll => UpdateType::ClearAll,
UpdateData::Customs(_) => UpdateType::Customs,
UpdateData::DocumentsAddition{ documents, .. } => UpdateType::DocumentsAddition {
number: documents.len(),
},
UpdateData::DocumentsPartial{ documents, .. } => UpdateType::DocumentsPartial {
number: documents.len(),
},
UpdateData::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
number: deletion.len(),
},
UpdateData::Settings(update) => UpdateType::Settings {
settings: update.clone(),
},
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "name")]
pub enum UpdateType {
ClearAll,
Customs,
DocumentsAddition { number: usize },
DocumentsPartial { number: usize },
DocumentsDeletion { number: usize },
Settings { settings: Box<SettingsUpdate> },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ProcessedUpdateResult {
pub update_id: u64,
#[serde(rename = "type")]
pub update_type: UpdateType,
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error_type: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error_code: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error_link: Option<String>,
pub duration: f64, // in seconds
pub enqueued_at: DateTime<Utc>,
pub processed_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct EnqueuedUpdateResult {
pub update_id: u64,
#[serde(rename = "type")]
pub update_type: UpdateType,
pub enqueued_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", tag = "status")]
pub enum UpdateStatus {
Enqueued {
#[serde(flatten)]
content: EnqueuedUpdateResult,
},
Failed {
#[serde(flatten)]
content: ProcessedUpdateResult,
},
Processed {
#[serde(flatten)]
content: ProcessedUpdateResult,
},
}
pub fn update_status(
update_reader: &heed::RoTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
update_id: u64,
) -> MResult<Option<UpdateStatus>> {
match updates_results_store.update_result(update_reader, update_id)? {
Some(result) => {
if result.error.is_some() {
Ok(Some(UpdateStatus::Failed { content: result }))
} else {
Ok(Some(UpdateStatus::Processed { content: result }))
}
},
None => match updates_store.get(update_reader, update_id)? {
Some(update) => Ok(Some(UpdateStatus::Enqueued {
content: EnqueuedUpdateResult {
update_id,
update_type: update.data.update_type(),
enqueued_at: update.enqueued_at,
},
})),
None => Ok(None),
},
}
}
pub fn next_update_id(
update_writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> ZResult<u64> {
let last_update = updates_store.last_update(update_writer)?;
let last_update = last_update.map(|(n, _)| n);
let last_update_results_id = updates_results_store.last_update(update_writer)?;
let last_update_results_id = last_update_results_id.map(|(n, _)| n);
let max_update_id = cmp::max(last_update, last_update_results_id);
let new_update_id = max_update_id.map_or(0, |n| n + 1);
Ok(new_update_id)
}
pub fn update_task(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
update_id: u64,
update: Update,
) -> MResult<ProcessedUpdateResult> {
debug!("Processing update number {}", update_id);
let Update { enqueued_at, data } = update;
let (update_type, result, duration) = match data {
UpdateData::ClearAll => {
let start = Instant::now();
let update_type = UpdateType::ClearAll;
let result = apply_clear_all(writer, index);
(update_type, result, start.elapsed())
}
UpdateData::Customs(customs) => {
let start = Instant::now();
let update_type = UpdateType::Customs;
let result = apply_customs_update(writer, index.main, &customs).map_err(Into::into);
(update_type, result, start.elapsed())
}
UpdateData::DocumentsAddition { documents, primary_key } => {
let start = Instant::now();
let update_type = UpdateType::DocumentsAddition {
number: documents.len(),
};
let result = apply_documents_addition(writer, index, documents, primary_key);
(update_type, result, start.elapsed())
}
UpdateData::DocumentsPartial{ documents, primary_key } => {
let start = Instant::now();
let update_type = UpdateType::DocumentsPartial {
number: documents.len(),
};
let result = apply_documents_partial_addition(writer, index, documents, primary_key);
(update_type, result, start.elapsed())
}
UpdateData::DocumentsDeletion(documents) => {
let start = Instant::now();
let update_type = UpdateType::DocumentsDeletion {
number: documents.len(),
};
let result = apply_documents_deletion(writer, index, documents);
(update_type, result, start.elapsed())
}
UpdateData::Settings(settings) => {
let start = Instant::now();
let update_type = UpdateType::Settings {
settings: settings.clone(),
};
let result = apply_settings_update(
writer,
index,
*settings,
);
(update_type, result, start.elapsed())
}
};
debug!(
"Processed update number {} {:?} {:?}",
update_id, update_type, result
);
let status = ProcessedUpdateResult {
update_id,
update_type,
error: result.as_ref().map_err(|e| e.to_string()).err(),
error_code: result.as_ref().map_err(|e| e.error_name()).err(),
error_type: result.as_ref().map_err(|e| e.error_type()).err(),
error_link: result.as_ref().map_err(|e| e.error_url()).err(),
duration: duration.as_secs_f64(),
enqueued_at,
processed_at: Utc::now(),
};
Ok(status)
}
fn compute_short_prefixes<A>(
writer: &mut heed::RwTxn<MainT>,
words_fst: &fst::Set<A>,
index: &store::Index,
) -> MResult<()>
where A: AsRef<[u8]>,
{
// clear the prefixes
let pplc_store = index.prefix_postings_lists_cache;
pplc_store.clear(writer)?;
for prefix_len in 1..=2 {
// compute prefixes and store those in the PrefixPostingsListsCache store.
let mut previous_prefix: Option<([u8; 4], Vec<_>)> = None;
let mut stream = words_fst.into_stream();
while let Some(input) = stream.next() {
// We skip the prefixes that are shorter than the current length
// we want to cache (<). We must ignore the input when it is exactly the
// same word as the prefix because if we match exactly on it we need
// to consider it as an exact match and not as a prefix (=).
if input.len() <= prefix_len { continue }
if let Some(postings_list) = index.postings_lists.postings_list(writer, input)?.map(|p| p.matches.into_owned()) {
let prefix = &input[..prefix_len];
let mut arr_prefix = [0; 4];
arr_prefix[..prefix_len].copy_from_slice(prefix);
match previous_prefix {
Some((ref mut prev_prefix, ref mut prev_pl)) if *prev_prefix != arr_prefix => {
prev_pl.sort_unstable();
prev_pl.dedup();
if let Ok(prefix) = std::str::from_utf8(&prev_prefix[..prefix_len]) {
debug!("writing the prefix of {:?} of length {}", prefix, prev_pl.len());
}
let pls = Set::new_unchecked(&prev_pl);
pplc_store.put_prefix_postings_list(writer, *prev_prefix, &pls)?;
*prev_prefix = arr_prefix;
prev_pl.clear();
prev_pl.extend_from_slice(&postings_list);
},
Some((_, ref mut prev_pl)) => prev_pl.extend_from_slice(&postings_list),
None => previous_prefix = Some((arr_prefix, postings_list.to_vec())),
}
}
}
// write the last prefix postings lists
if let Some((prev_prefix, mut prev_pl)) = previous_prefix.take() {
prev_pl.sort_unstable();
prev_pl.dedup();
let pls = Set::new_unchecked(&prev_pl);
pplc_store.put_prefix_postings_list(writer, prev_prefix, &pls)?;
}
}
Ok(())
}
fn cache_document_ids_sorted(
writer: &mut heed::RwTxn<MainT>,
ranked_map: &RankedMap,
index: &store::Index,
document_ids: &mut [DocumentId],
) -> MResult<()> {
crate::bucket_sort::placeholder_document_sort(document_ids, index, writer, ranked_map)?;
index.main.put_sorted_document_ids_cache(writer, &document_ids)
}

View File

@ -1,332 +0,0 @@
use std::{borrow::Cow, collections::{BTreeMap, BTreeSet}};
use heed::Result as ZResult;
use fst::{SetBuilder, set::OpBuilder};
use sdset::SetBuf;
use meilisearch_schema::Schema;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use crate::database::{MainT, UpdateT};
use crate::settings::{UpdateState, SettingsUpdate, RankingRule};
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update};
use crate::{store, MResult, Error};
pub fn push_settings_update(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
settings: SettingsUpdate,
) -> ZResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::settings(settings);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_settings_update(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
settings: SettingsUpdate,
) -> MResult<()> {
let mut must_reindex = false;
let mut schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => {
match settings.primary_key.clone() {
UpdateState::Update(id) => Schema::with_primary_key(&id),
_ => return Err(Error::MissingPrimaryKey)
}
}
};
match settings.ranking_rules {
UpdateState::Update(v) => {
let ranked_field: Vec<&str> = v.iter().filter_map(RankingRule::field).collect();
schema.update_ranked(&ranked_field)?;
index.main.put_ranking_rules(writer, &v)?;
must_reindex = true;
},
UpdateState::Clear => {
index.main.delete_ranking_rules(writer)?;
schema.clear_ranked();
must_reindex = true;
},
UpdateState::Nothing => (),
}
match settings.distinct_attribute {
UpdateState::Update(v) => {
let field_id = schema.insert(&v)?;
index.main.put_distinct_attribute(writer, field_id)?;
},
UpdateState::Clear => {
index.main.delete_distinct_attribute(writer)?;
},
UpdateState::Nothing => (),
}
match settings.searchable_attributes.clone() {
UpdateState::Update(v) => {
if v.iter().any(|e| e == "*") || v.is_empty() {
schema.set_all_searchable();
} else {
schema.update_searchable(v)?;
}
must_reindex = true;
},
UpdateState::Clear => {
schema.set_all_searchable();
must_reindex = true;
},
UpdateState::Nothing => (),
}
match settings.displayed_attributes.clone() {
UpdateState::Update(v) => {
if v.contains("*") || v.is_empty() {
schema.set_all_displayed();
} else {
schema.update_displayed(v)?
}
},
UpdateState::Clear => {
schema.set_all_displayed();
},
UpdateState::Nothing => (),
}
match settings.attributes_for_faceting {
UpdateState::Update(attrs) => {
apply_attributes_for_faceting_update(writer, index, &mut schema, &attrs)?;
must_reindex = true;
},
UpdateState::Clear => {
index.main.delete_attributes_for_faceting(writer)?;
index.facets.clear(writer)?;
},
UpdateState::Nothing => (),
}
index.main.put_schema(writer, &schema)?;
match settings.stop_words {
UpdateState::Update(stop_words) => {
if apply_stop_words_update(writer, index, stop_words)? {
must_reindex = true;
}
},
UpdateState::Clear => {
if apply_stop_words_update(writer, index, BTreeSet::new())? {
must_reindex = true;
}
},
UpdateState::Nothing => (),
}
match settings.synonyms {
UpdateState::Update(synonyms) => apply_synonyms_update(writer, index, synonyms)?,
UpdateState::Clear => apply_synonyms_update(writer, index, BTreeMap::new())?,
UpdateState::Nothing => (),
}
if must_reindex {
reindex_all_documents(writer, index)?;
}
Ok(())
}
fn apply_attributes_for_faceting_update(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
schema: &mut Schema,
attributes: &[String]
) -> MResult<()> {
let mut attribute_ids = Vec::new();
for name in attributes {
attribute_ids.push(schema.insert(name)?);
}
let attributes_for_faceting = SetBuf::from_dirty(attribute_ids);
index.main.put_attributes_for_faceting(writer, &attributes_for_faceting)?;
Ok(())
}
pub fn apply_stop_words_update(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
stop_words: BTreeSet<String>,
) -> MResult<bool>
{
let mut must_reindex = false;
let old_stop_words: BTreeSet<String> = index.main
.stop_words_fst(writer)?
.stream()
.into_strs()?
.into_iter()
.collect();
let deletion: BTreeSet<String> = old_stop_words.difference(&stop_words).cloned().collect();
let addition: BTreeSet<String> = stop_words.difference(&old_stop_words).cloned().collect();
if !addition.is_empty() {
apply_stop_words_addition(writer, index, addition)?;
}
if !deletion.is_empty() {
must_reindex = true;
apply_stop_words_deletion(writer, index, deletion)?;
}
let words_fst = index.main.words_fst(writer)?;
if !words_fst.is_empty() {
let stop_words = fst::Set::from_iter(stop_words)?;
let op = OpBuilder::new()
.add(&words_fst)
.add(&stop_words)
.difference();
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(op)?;
let words_fst = builder.into_set();
index.main.put_words_fst(writer, &words_fst)?;
index.main.put_stop_words_fst(writer, &stop_words)?;
}
Ok(must_reindex)
}
fn apply_stop_words_addition(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
addition: BTreeSet<String>,
) -> MResult<()>
{
let main_store = index.main;
let postings_lists_store = index.postings_lists;
let mut stop_words_builder = SetBuilder::memory();
for word in addition {
stop_words_builder.insert(&word)?;
// we remove every posting list associated to a new stop word
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder.into_set();
// we also need to remove all the stop words from the main fst
let words_fst = main_store.words_fst(writer)?;
if !words_fst.is_empty() {
let op = OpBuilder::new()
.add(&words_fst)
.add(&delta_stop_words)
.difference();
let mut word_fst_builder = SetBuilder::memory();
word_fst_builder.extend_stream(op)?;
let word_fst = word_fst_builder.into_set();
main_store.put_words_fst(writer, &word_fst)?;
}
// now we add all of these stop words from the main store
let stop_words_fst = main_store.stop_words_fst(writer)?;
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.r#union();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op)?;
let stop_words_fst = stop_words_builder.into_set();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
Ok(())
}
fn apply_stop_words_deletion(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
deletion: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in deletion {
stop_words_builder.insert(&word)?;
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder.into_set();
// now we delete all of these stop words from the main store
let stop_words_fst = index.main.stop_words_fst(writer)?;
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.difference();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op)?;
let stop_words_fst = stop_words_builder.into_set();
Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?)
}
pub fn apply_synonyms_update(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
synonyms: BTreeMap<String, Vec<String>>,
) -> MResult<()> {
let main_store = index.main;
let synonyms_store = index.synonyms;
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
fn normalize<T: AsRef<[u8]>>(analyzer: &Analyzer<T>, text: &str) -> String {
analyzer.analyze(&text)
.tokens()
.fold(String::new(), |s, t| s + t.text())
}
// normalize synonyms and reorder them creating a BTreeMap
let synonyms: BTreeMap<String, Vec<String>> = synonyms.into_iter().map( |(word, alternatives)| {
let word = normalize(&analyzer, &word);
let alternatives = alternatives.into_iter().map(|text| normalize(&analyzer, &text)).collect();
(word, alternatives)
}).collect();
// index synonyms,
// synyonyms have to be ordered by key before indexation
let mut synonyms_builder = SetBuilder::memory();
synonyms_store.clear(writer)?;
for (word, alternatives) in synonyms {
synonyms_builder.insert(&word)?;
let alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives)?;
alternatives_builder.into_set()
};
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
}
let synonyms_set = synonyms_builder.into_set();
main_store.put_synonyms_fst(writer, &synonyms_set)?;
Ok(())
}

View File

@ -1,8 +1,8 @@
[package] [package]
name = "meilisearch-error" name = "meilisearch-error"
version = "0.20.0" version = "0.21.0"
authors = ["marin <postma.marin@protonmail.com>"] authors = ["marin <postma.marin@protonmail.com>"]
edition = "2018" edition = "2018"
[dependencies] [dependencies]
actix-http = "2.2.0" actix-http = "=3.0.0-beta.6"

View File

@ -81,7 +81,6 @@ pub enum Code {
} }
impl Code { impl Code {
/// ascociate a `Code` variant to the actual ErrCode /// ascociate a `Code` variant to the actual ErrCode
fn err_code(&self) -> ErrCode { fn err_code(&self) -> ErrCode {
use Code::*; use Code::*;
@ -94,17 +93,23 @@ impl Code {
// thrown when requesting an unexisting index // thrown when requesting an unexisting index
IndexNotFound => ErrCode::invalid("index_not_found", StatusCode::NOT_FOUND), IndexNotFound => ErrCode::invalid("index_not_found", StatusCode::NOT_FOUND),
InvalidIndexUid => ErrCode::invalid("invalid_index_uid", StatusCode::BAD_REQUEST), InvalidIndexUid => ErrCode::invalid("invalid_index_uid", StatusCode::BAD_REQUEST),
OpenIndex => ErrCode::internal("index_not_accessible", StatusCode::INTERNAL_SERVER_ERROR), OpenIndex => {
ErrCode::internal("index_not_accessible", StatusCode::INTERNAL_SERVER_ERROR)
}
// invalid state error // invalid state error
InvalidState => ErrCode::internal("invalid_state", StatusCode::INTERNAL_SERVER_ERROR), InvalidState => ErrCode::internal("invalid_state", StatusCode::INTERNAL_SERVER_ERROR),
// thrown when no primary key has been set // thrown when no primary key has been set
MissingPrimaryKey => ErrCode::invalid("missing_primary_key", StatusCode::BAD_REQUEST), MissingPrimaryKey => ErrCode::invalid("missing_primary_key", StatusCode::BAD_REQUEST),
// error thrown when trying to set an already existing primary key // error thrown when trying to set an already existing primary key
PrimaryKeyAlreadyPresent => ErrCode::invalid("primary_key_already_present", StatusCode::BAD_REQUEST), PrimaryKeyAlreadyPresent => {
ErrCode::invalid("primary_key_already_present", StatusCode::BAD_REQUEST)
}
// invalid document // invalid document
MaxFieldsLimitExceeded => ErrCode::invalid("max_fields_limit_exceeded", StatusCode::BAD_REQUEST), MaxFieldsLimitExceeded => {
ErrCode::invalid("max_fields_limit_exceeded", StatusCode::BAD_REQUEST)
}
MissingDocumentId => ErrCode::invalid("missing_document_id", StatusCode::BAD_REQUEST), MissingDocumentId => ErrCode::invalid("missing_document_id", StatusCode::BAD_REQUEST),
// error related to facets // error related to facets
@ -117,16 +122,26 @@ impl Code {
DocumentNotFound => ErrCode::invalid("document_not_found", StatusCode::NOT_FOUND), DocumentNotFound => ErrCode::invalid("document_not_found", StatusCode::NOT_FOUND),
Internal => ErrCode::internal("internal", StatusCode::INTERNAL_SERVER_ERROR), Internal => ErrCode::internal("internal", StatusCode::INTERNAL_SERVER_ERROR),
InvalidToken => ErrCode::authentication("invalid_token", StatusCode::FORBIDDEN), InvalidToken => ErrCode::authentication("invalid_token", StatusCode::FORBIDDEN),
MissingAuthorizationHeader => ErrCode::authentication("missing_authorization_header", StatusCode::UNAUTHORIZED), MissingAuthorizationHeader => {
ErrCode::authentication("missing_authorization_header", StatusCode::UNAUTHORIZED)
}
NotFound => ErrCode::invalid("not_found", StatusCode::NOT_FOUND), NotFound => ErrCode::invalid("not_found", StatusCode::NOT_FOUND),
PayloadTooLarge => ErrCode::invalid("payload_too_large", StatusCode::PAYLOAD_TOO_LARGE), PayloadTooLarge => ErrCode::invalid("payload_too_large", StatusCode::PAYLOAD_TOO_LARGE),
RetrieveDocument => ErrCode::internal("unretrievable_document", StatusCode::BAD_REQUEST), RetrieveDocument => {
ErrCode::internal("unretrievable_document", StatusCode::BAD_REQUEST)
}
SearchDocuments => ErrCode::internal("search_error", StatusCode::BAD_REQUEST), SearchDocuments => ErrCode::internal("search_error", StatusCode::BAD_REQUEST),
UnsupportedMediaType => ErrCode::invalid("unsupported_media_type", StatusCode::UNSUPPORTED_MEDIA_TYPE), UnsupportedMediaType => {
ErrCode::invalid("unsupported_media_type", StatusCode::UNSUPPORTED_MEDIA_TYPE)
}
// error related to dump // error related to dump
DumpAlreadyInProgress => ErrCode::invalid("dump_already_in_progress", StatusCode::CONFLICT), DumpAlreadyInProgress => {
DumpProcessFailed => ErrCode::internal("dump_process_failed", StatusCode::INTERNAL_SERVER_ERROR), ErrCode::invalid("dump_already_in_progress", StatusCode::CONFLICT)
}
DumpProcessFailed => {
ErrCode::internal("dump_process_failed", StatusCode::INTERNAL_SERVER_ERROR)
}
} }
} }

View File

@ -1,86 +1,123 @@
[package] [package]
name = "meilisearch-http" authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
description = "MeiliSearch HTTP server" description = "MeiliSearch HTTP server"
version = "0.20.0"
license = "MIT"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",
]
edition = "2018" edition = "2018"
license = "MIT"
name = "meilisearch-http"
version = "0.21.0"
[[bin]] [[bin]]
name = "meilisearch" name = "meilisearch"
path = "src/main.rs" path = "src/main.rs"
[features] [build-dependencies]
default = ["sentry"] actix-web-static-files = { git = "https://github.com/MarinPostma/actix-web-static-files.git", rev = "6db8c3e", optional = true }
anyhow = { version = "*", optional = true }
cargo_toml = { version = "0.9.0", optional = true }
hex = { version = "0.4.3", optional = true }
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false, optional = true }
sha-1 = { version = "0.9.4", optional = true }
tempfile = { version = "3.1.0", optional = true }
vergen = "3.1.0"
zip = { version = "0.5.12", optional = true }
[dependencies] [dependencies]
actix-cors = "0.5.4" actix-cors = { git = "https://github.com/MarinPostma/actix-extras.git", rev = "2dac1a4"}
actix-http = "2.2.0" actix-http = { version = "=3.0.0-beta.6" }
actix-rt = "1.1.1" actix-service = "2.0.0"
actix-service = "1.0.6" actix-web = { version = "=4.0.0-beta.6", features = ["rustls"] }
actix-web = { version = "3.3.2", features = ["rustls"] } actix-web-static-files = { git = "https://github.com/MarinPostma/actix-web-static-files.git", rev = "6db8c3e", optional = true }
bytes = "1.0.0" anyhow = "1.0.36"
async-stream = "0.3.0"
async-trait = "0.1.42"
arc-swap = "1.2.0"
byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
bytes = "0.6.0"
chrono = { version = "0.4.19", features = ["serde"] } chrono = { version = "0.4.19", features = ["serde"] }
crossbeam-channel = "0.5.0" crossbeam-channel = "0.5.0"
either = "1.6.1"
env_logger = "0.8.2" env_logger = "0.8.2"
flate2 = "1.0.19" flate2 = "1.0.19"
futures = "0.3.8" fst = "0.4.5"
http = "0.2.2" futures = "0.3.7"
indexmap = { version = "1.6.1", features = ["serde-1"] } futures-util = "0.3.8"
log = "0.4.11" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
main_error = "0.1.1" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" }
meilisearch-core = { path = "../meilisearch-core", version = "0.20.0" } http = "0.2.1"
meilisearch-error = { path = "../meilisearch-error", version = "0.20.0" } indexmap = { version = "1.3.2", features = ["serde-1"] }
meilisearch-schema = { path = "../meilisearch-schema", version = "0.20.0" } itertools = "0.10.0"
log = "0.4.8"
main_error = "0.1.0"
meilisearch-error = { path = "../meilisearch-error" }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.3" }
memmap = "0.7.0"
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.7.0" }
mime = "0.3.16" mime = "0.3.16"
num_cpus = "1.13.0"
once_cell = "1.5.2" once_cell = "1.5.2"
rand = "0.8.1" oxidized-json-checker = "0.3.2"
parking_lot = "0.11.1"
rand = "0.7.3"
rayon = "1.5.0"
regex = "1.4.2" regex = "1.4.2"
rustls = "0.18.0" rustls = "0.19"
serde = { version = "1.0.118", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = { version = "1.0.61", features = ["preserve_order"] } serde_json = { version = "1.0.59", features = ["preserve_order"] }
serde_qs = "0.8.2" sha2 = "0.9.1"
sha2 = "0.9.2" siphasher = "0.3.2"
siphasher = "0.3.3"
slice-group-by = "0.2.6" slice-group-by = "0.2.6"
structopt = "0.3.21" structopt = "0.3.20"
tar = "0.4.30" tar = "0.4.29"
tempfile = "3.1.0" tempfile = "3.1.0"
tokio = { version = "0.2", features = ["macros"] } thiserror = "1.0.24"
ureq = { version = "2.0.0", features = ["tls"], default-features = false } tokio = { version = "1", features = ["full"] }
uuid = "0.8" uuid = { version = "0.8.2", features = ["serde"] }
walkdir = "2.3.1" walkdir = "2.3.2"
whoami = "1.0.3" obkv = "0.1.1"
pin-project = "1.0.7"
whoami = { version = "1.1.2", optional = true }
reqwest = { version = "0.11.3", features = ["json", "rustls-tls"], default-features = false, optional = true }
[dependencies.sentry] [dependencies.sentry]
version = "0.18.1"
default-features = false default-features = false
features = [ features = [
"with_client_implementation", "backtrace",
"with_panic", "contexts",
"with_failure", "panic",
"with_device_info", "reqwest",
"with_rust_info", "rustls",
"with_reqwest_transport", "log",
"with_rustls",
"with_env_logger"
] ]
optional = true optional = true
version = "0.22.0"
[dev-dependencies] [dev-dependencies]
actix-rt = "2.1.0"
assert-json-diff = { branch = "master", git = "https://github.com/qdequele/assert-json-diff" }
mockall = "0.9.1"
paste = "1.0.5"
serde_url_params = "0.2.0" serde_url_params = "0.2.0"
tempdir = "0.3.7" tempdir = "0.3.7"
tokio = { version = "0.2", features = ["macros", "time"] } urlencoding = "1.1.1"
[dev-dependencies.assert-json-diff] [features]
git = "https://github.com/qdequele/assert-json-diff" mini-dashboard = [
branch = "master" "actix-web-static-files",
"anyhow",
[build-dependencies] "cargo_toml",
vergen = "3.1.0" "hex",
"reqwest",
"sha-1",
"tempfile",
"zip",
]
analytics = ["sentry", "whoami", "reqwest"]
default = ["analytics", "mini-dashboard"]
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
jemallocator = "0.3.2" jemallocator = "0.3.2"
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.1.3/build.zip"
sha1 = "fea1780e13d8e570e35a1921e7a45cabcd501d5e"

View File

@ -7,4 +7,83 @@ fn main() {
// Generate the 'cargo:' key output // Generate the 'cargo:' key output
generate_cargo_keys(ConstantsFlags::all()).expect("Unable to generate the cargo keys!"); generate_cargo_keys(ConstantsFlags::all()).expect("Unable to generate the cargo keys!");
#[cfg(feature = "mini-dashboard")]
mini_dashboard::setup_mini_dashboard().expect("Could not load the mini-dashboard assets");
}
#[cfg(feature = "mini-dashboard")]
mod mini_dashboard {
use std::env;
use std::fs::{create_dir_all, File, OpenOptions};
use std::io::{Cursor, Read, Write};
use std::path::PathBuf;
use actix_web_static_files::resource_dir;
use anyhow::Context;
use cargo_toml::Manifest;
use reqwest::blocking::get;
use sha1::{Digest, Sha1};
pub fn setup_mini_dashboard() -> anyhow::Result<()> {
let cargo_manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
let cargo_toml = cargo_manifest_dir.join("Cargo.toml");
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
let sha1_path = out_dir.join(".mini-dashboard.sha1");
let dashboard_dir = out_dir.join("mini-dashboard");
let manifest = Manifest::from_path(cargo_toml).unwrap();
let meta = &manifest
.package
.as_ref()
.context("package not specified in Cargo.toml")?
.metadata
.as_ref()
.context("no metadata specified in Cargo.toml")?["mini-dashboard"];
// Check if there already is a dashboard built, and if it is up to date.
if sha1_path.exists() && dashboard_dir.exists() {
let mut sha1_file = File::open(&sha1_path)?;
let mut sha1 = String::new();
sha1_file.read_to_string(&mut sha1)?;
if sha1 == meta["sha1"].as_str().unwrap() {
// Nothing to do.
return Ok(());
}
}
let url = meta["assets-url"].as_str().unwrap();
let dashboard_assets_bytes = get(url)?.bytes()?;
let mut hasher = Sha1::new();
hasher.update(&dashboard_assets_bytes);
let sha1 = hex::encode(hasher.finalize());
assert_eq!(
meta["sha1"].as_str().unwrap(),
sha1,
"Downloaded mini-dashboard shasum differs from the one specified in the Cargo.toml"
);
create_dir_all(&dashboard_dir)?;
let cursor = Cursor::new(&dashboard_assets_bytes);
let mut zip = zip::read::ZipArchive::new(cursor)?;
zip.extract(&dashboard_dir)?;
resource_dir(&dashboard_dir).build()?;
// Write the sha1 for the dashboard back to file.
let mut file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(sha1_path)?;
file.write_all(sha1.as_bytes())?;
file.flush()?;
Ok(())
}
} }

File diff suppressed because one or more lines are too long

View File

@ -1,364 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="bulma.min.css">
<title>MeiliSearch</title>
<style>
em {
color: hsl(204, 86%, 25%);
font-style: inherit;
background-color: hsl(204, 86%, 88%);
}
#results {
max-width: 900px;
margin: 20px auto 0 auto;
padding: 0;
}
.notification {
display: flex;
justify-content: center;
}
.level-left {
margin-right: 50px;
}
.document {
border-radius: 4px;
margin-bottom: 20px;
display: flex;
}
.document ol {
flex: 0 0 75%;
max-width: 75%;
padding: 0;
margin: 0;
list-style-type: none;
}
.document ol li {
list-style: none;
}
.document .image {
max-width: 50%;
margin: 0 auto;
box-sizing: border-box;
}
@media screen and (min-width: 770px) {
.document .image {
max-width: 25%;
flex: 0 0 25%;
margin: 0;
padding-left: 30px;
box-sizing: border-box;
}
}
.document .image img {
width: 100%;
}
.attribute {
text-align: center;
box-sizing: border-box;
text-transform: uppercase;
font-weight: bold;
color: rgba(0,0,0,.7);
}
@media screen and (min-width: 770px) {
.attribute {
flex: 0 0 25%;
max-width: 25%;
text-align: right;
padding-right: 10px;
font-weight: normal;
box-sizing: border-box;
}
}
@media screen and (max-width: 770px) {
.attribute {
padding-bottom: 0;
}
}
.content {
flex: 0 0 75%;
box-sizing: border-box;
color: rgba(0,0,0,.9);
overflow-wrap: anywhere;
}
.hero-foot {
padding-bottom: 3rem;
}
@media screen and (max-width: 770px) {
.align-on-mobile {
text-align: center;
}
}
</style>
</head>
<body>
<section class="hero is-light">
<div class="hero-body">
<div class="container">
<div class="content is-medium align-on-mobile">
<h1 class="title is-1 is-spaced">
Welcome to MeiliSearch
</h1>
<p class="subtitle is-4">
This dashboard will help you check the search results with ease.
</p>
</div>
<div id="apiKeyContainer" class="columns">
<input type="hidden" id="apiKey">
</div>
<div class="columns">
<div class="column is-8">
<label class="label" for="search">Search something</label>
<div class="field has-addons">
<div class="control">
<span class="select">
<select role="listbox" id="index" aria-label="Select the index you want to search on">
<!-- indexes names -->
</select>
</span>
</div>
<div class="control is-expanded">
<input id="search" class="input" type="search" autofocus placeholder="e.g. George Clooney" aria-label="Search through your documents">
</div>
</div>
</div>
<div class="column is-4">
<div class="columns">
<div class="column is-6 has-text-centered">
<p class="heading">Documents</p>
<p id="count" class="title">0</p>
</div>
<div class="column is-6 has-text-centered">
<p class="heading">Time Spent</p>
<p id="time" class="title">N/A</p>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section>
<div class="container">
<ol id="results" class="content">
<!-- documents matching resquests -->
</ol>
</div>
</section>
</body>
<script>
function setApiKeyField () {
var xmlHttp = new XMLHttpRequest();
xmlHttp.open("GET", `${baseUrl}/version`, false);
xmlHttp.onload = function () {
let apiKeyContainer = document.getElementById('apiKeyContainer');
if (xmlHttp.status === 401) {
document.getElementById('apiKey').remove();
let inputNode = document.createElement('input');
inputNode.setAttribute('id', 'apiKey');
inputNode.setAttribute('type', 'password');
inputNode.setAttribute('placeholder', 'Enter your API key');
inputNode.classList.add('input', 'is-small');
let controlNode = document.createElement('div');
controlNode.classList.add('control');
controlNode.appendChild(inputNode);
let labelNode = document.createElement('label');
labelNode.classList.add('label')
labelNode.setAttribute('for', 'apiKey');
let textNode = document.createTextNode('API Key');
labelNode.appendChild(textNode);
let fieldNode = document.createElement('div');
fieldNode.classList.add('field');
fieldNode.appendChild(labelNode);
fieldNode.append(controlNode);
let columnNode = document.createElement('div');
columnNode.classList.add('column', 'is-4');
columnNode.appendChild(fieldNode);
apiKeyContainer.appendChild(columnNode);
}
}
xmlHttp.send(null);
}
function sanitizeHTMLEntities(str) {
if (str && typeof str === 'string') {
str = str.replace(/</g,"&lt;");
str = str.replace(/>/g,"&gt;");
str = str.replace(/&lt;em&gt;/g,"<em>");
str = str.replace(/&lt;\/em&gt;/g,"<\/em>");
}
return str;
}
function httpGet(theUrl, apiKey) {
var xmlHttp = new XMLHttpRequest();
xmlHttp.open("GET", theUrl, false); // false for synchronous request
if (apiKey) {
xmlHttp.setRequestHeader("x-Meili-API-Key", apiKey);
}
xmlHttp.send(null);
return xmlHttp.responseText;
}
function refreshIndexList() {
// TODO we must not block here
let result = JSON.parse(httpGet(`${baseUrl}/indexes`, localStorage.getItem('apiKey')));
if (!Array.isArray(result)) { return }
let select = document.getElementById("index");
select.innerHTML = '';
for (index of result) {
const option = document.createElement('option');
option.value = index.uid;
option.innerHTML = index.name;
select.appendChild(option);
}
}
let lastRequest = undefined;
function triggerSearch() {
var e = document.getElementById("index");
if (e.selectedIndex == -1) { return }
var index = e.options[e.selectedIndex].value;
let theUrl = `${baseUrl}/indexes/${index}/search?q=${encodeURIComponent(search.value)}&attributesToHighlight=*`;
if (lastRequest) { lastRequest.abort() }
lastRequest = new XMLHttpRequest();
lastRequest.open("GET", theUrl, true);
if (localStorage.getItem('apiKey')) {
lastRequest.setRequestHeader("x-Meili-API-Key", localStorage.getItem('apiKey'));
}
lastRequest.onload = function (e) {
if (lastRequest.readyState === 4 && lastRequest.status === 200) {
let sanitizedResponseText = sanitizeHTMLEntities(lastRequest.responseText);
let httpResults = JSON.parse(sanitizedResponseText);
results.innerHTML = '';
let processingTimeMs = httpResults.processingTimeMs;
let numberOfDocuments = httpResults.nbHits;
time.innerHTML = `${processingTimeMs}ms`;
count.innerHTML = `${numberOfDocuments}`;
for (result of httpResults.hits) {
const element = {...result, ...result._formatted };
delete element._formatted;
const elem = document.createElement('li');
elem.classList.add("document","box");
const div = document.createElement('div');
div.classList.add("columns","is-desktop","is-tablet");
const info = document.createElement('div');
info.classList.add("column","align-on-mobile");
let image = undefined;
for (const prop in element) {
// Check if property is an image url link.
if (typeof result[prop] === 'string') {
if (image == undefined && result[prop].match(/^(https|http):\/\/.*(jpe?g|png|gif)(\?.*)?$/g)) {
image = result[prop];
}
}
const field = document.createElement('div');
field.classList.add("columns");
const attribute = document.createElement('div');
attribute.classList.add("attribute", "column");
attribute.innerHTML = prop;
const content = document.createElement('div');
content.classList.add("content", "column");
if (typeof (element[prop]) === "object") {
content.innerHTML = JSON.stringify(element[prop]);
} else {
content.innerHTML = element[prop];
}
field.appendChild(attribute);
field.appendChild(content);
info.appendChild(field);
}
div.appendChild(info);
elem.appendChild(div);
if (image != undefined) {
const divImage = document.createElement('div');
divImage.classList.add("image","column","align-on-mobile");
const img = document.createElement('img');
img.src = image;
img.setAttribute("alt","Item illustration");
divImage.appendChild(img);
div.appendChild(divImage);
elem.appendChild(div);
}
results.appendChild(elem)
}
} else {
console.error(lastRequest.statusText);
}
};
lastRequest.send(null);
}
if (!apiKey.value) {
apiKey.value = localStorage.getItem('apiKey');
}
apiKey.addEventListener('input', function(e) {
localStorage.setItem('apiKey', apiKey.value);
refreshIndexList();
}, false);
let baseUrl = window.location.origin;
setApiKeyField();
refreshIndexList();
search.oninput = triggerSearch;
let select = document.getElementById("index");
select.onchange = triggerSearch;
triggerSearch();
</script>
</html>

View File

@ -1,12 +1,9 @@
use std::hash::{Hash, Hasher}; use std::hash::{Hash, Hasher};
use std::{error, thread};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use log::error; use log::debug;
use serde::Serialize; use serde::Serialize;
use serde_qs as qs;
use siphasher::sip::SipHasher; use siphasher::sip::SipHasher;
use walkdir::WalkDir;
use crate::Data; use crate::Data;
use crate::Opt; use crate::Opt;
@ -21,31 +18,21 @@ struct EventProperties {
} }
impl EventProperties { impl EventProperties {
fn from(data: Data) -> Result<EventProperties, Box<dyn error::Error>> { async fn from(data: Data) -> anyhow::Result<EventProperties> {
let mut index_list = Vec::new(); let stats = data.index_controller.get_all_stats().await?;
let reader = data.db.main_read_txn()?; let database_size = stats.database_size;
let last_update_timestamp = stats.last_update.map(|u| u.timestamp());
for index_uid in data.db.indexes_uids() { let number_of_documents = stats
if let Some(index) = data.db.open_index(&index_uid) { .indexes
let number_of_documents = index.main.number_of_documents(&reader)?; .values()
index_list.push(number_of_documents); .map(|index| index.number_of_documents)
} .collect();
}
let database_size = WalkDir::new(&data.db_path)
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.filter(|metadata| metadata.is_file())
.fold(0, |acc, m| acc + m.len());
let last_update_timestamp = data.db.last_update(&reader)?.map(|u| u.timestamp());
Ok(EventProperties { Ok(EventProperties {
database_size, database_size,
last_update_timestamp, last_update_timestamp,
number_of_documents: index_list, number_of_documents,
}) })
} }
} }
@ -72,10 +59,10 @@ struct Event<'a> {
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]
struct AmplitudeRequest<'a> { struct AmplitudeRequest<'a> {
api_key: &'a str, api_key: &'a str,
event: &'a str, events: Vec<Event<'a>>,
} }
pub fn analytics_sender(data: Data, opt: Opt) { pub async fn analytics_sender(data: Data, opt: Opt) {
let username = whoami::username(); let username = whoami::username();
let hostname = whoami::hostname(); let hostname = whoami::hostname();
let platform = whoami::platform(); let platform = whoami::platform();
@ -97,7 +84,7 @@ pub fn analytics_sender(data: Data, opt: Opt) {
let time = n.as_secs(); let time = n.as_secs();
let event_type = "runtime_tick"; let event_type = "runtime_tick";
let elapsed_since_start = first_start.elapsed().as_secs() / 86_400; // One day let elapsed_since_start = first_start.elapsed().as_secs() / 86_400; // One day
let event_properties = EventProperties::from(data.clone()).ok(); let event_properties = EventProperties::from(data.clone()).await.ok();
let app_version = env!("CARGO_PKG_VERSION").to_string(); let app_version = env!("CARGO_PKG_VERSION").to_string();
let app_version = app_version.as_str(); let app_version = app_version.as_str();
let user_email = std::env::var("MEILI_USER_EMAIL").ok(); let user_email = std::env::var("MEILI_USER_EMAIL").ok();
@ -116,27 +103,24 @@ pub fn analytics_sender(data: Data, opt: Opt) {
time, time,
app_version, app_version,
user_properties, user_properties,
event_properties event_properties,
}; };
let event = serde_json::to_string(&event).unwrap();
let request = AmplitudeRequest { let request = AmplitudeRequest {
api_key: AMPLITUDE_API_KEY, api_key: AMPLITUDE_API_KEY,
event: &event, events: vec![event],
}; };
let body = qs::to_string(&request).unwrap(); let response = reqwest::Client::new()
let response = ureq::post("https://api.amplitude.com/httpapi").send_string(&body); .post("https://api2.amplitude.com/2/httpapi")
match response { .timeout(Duration::from_secs(60)) // 1 minute max
Err(ureq::Error::Status(_ , response)) => { .json(&request)
error!("Unsuccessful call to Amplitude: {}", response.into_string().unwrap_or_default()); .send()
} .await;
Err(e) => { if let Err(e) = response {
error!("Unsuccessful call to Amplitude: {}", e); debug!("Unsuccessful call to Amplitude: {}", e);
}
_ => (),
} }
thread::sleep(Duration::from_secs(3600)) // one hour tokio::time::sleep(Duration::from_secs(3600)).await;
} }
} }

View File

@ -1,175 +0,0 @@
use std::error::Error;
use std::ops::Deref;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use meilisearch_core::{Database, DatabaseOptions, Index};
use sha2::Digest;
use crate::error::{Error as MSError, ResponseError};
use crate::index_update_callback;
use crate::option::Opt;
use crate::dump::DumpInfo;
#[derive(Clone)]
pub struct Data {
inner: Arc<DataInner>,
}
impl Deref for Data {
type Target = DataInner;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
#[derive(Clone)]
pub struct DataInner {
pub db: Arc<Database>,
pub db_path: String,
pub dumps_dir: PathBuf,
pub dump_batch_size: usize,
pub api_keys: ApiKeys,
pub server_pid: u32,
pub http_payload_size_limit: usize,
pub current_dump: Arc<Mutex<Option<DumpInfo>>>,
}
#[derive(Clone)]
pub struct ApiKeys {
pub public: Option<String>,
pub private: Option<String>,
pub master: Option<String>,
}
impl ApiKeys {
pub fn generate_missing_api_keys(&mut self) {
if let Some(master_key) = &self.master {
if self.private.is_none() {
let key = format!("{}-private", master_key);
let sha = sha2::Sha256::digest(key.as_bytes());
self.private = Some(format!("{:x}", sha));
}
if self.public.is_none() {
let key = format!("{}-public", master_key);
let sha = sha2::Sha256::digest(key.as_bytes());
self.public = Some(format!("{:x}", sha));
}
}
}
}
impl Data {
pub fn new(opt: Opt) -> Result<Data, Box<dyn Error>> {
let db_path = opt.db_path.clone();
let dumps_dir = opt.dumps_dir.clone();
let dump_batch_size = opt.dump_batch_size;
let server_pid = std::process::id();
let db_opt = DatabaseOptions {
main_map_size: opt.max_mdb_size,
update_map_size: opt.max_udb_size,
};
let http_payload_size_limit = opt.http_payload_size_limit;
let db = Arc::new(Database::open_or_create(opt.db_path, db_opt)?);
let mut api_keys = ApiKeys {
master: opt.master_key,
private: None,
public: None,
};
api_keys.generate_missing_api_keys();
let current_dump = Arc::new(Mutex::new(None));
let inner_data = DataInner {
db: db.clone(),
db_path,
dumps_dir,
dump_batch_size,
api_keys,
server_pid,
http_payload_size_limit,
current_dump,
};
let data = Data {
inner: Arc::new(inner_data),
};
let callback_context = data.clone();
db.set_update_callback(Box::new(move |index_uid, status| {
index_update_callback(&index_uid, &callback_context, status);
}));
Ok(data)
}
fn create_index(&self, uid: &str) -> Result<Index, ResponseError> {
if !uid
.chars()
.all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_')
{
return Err(MSError::InvalidIndexUid.into());
}
let created_index = self.db.create_index(&uid).map_err(|e| match e {
meilisearch_core::Error::IndexAlreadyExists => e.into(),
_ => ResponseError::from(MSError::create_index(e)),
})?;
self.db.main_write::<_, _, ResponseError>(|mut writer| {
created_index.main.put_name(&mut writer, uid)?;
created_index
.main
.created_at(&writer)?
.ok_or(MSError::internal("Impossible to read created at"))?;
created_index
.main
.updated_at(&writer)?
.ok_or(MSError::internal("Impossible to read updated at"))?;
Ok(())
})?;
Ok(created_index)
}
pub fn get_current_dump_info(&self) -> Option<DumpInfo> {
self.current_dump.lock().unwrap().clone()
}
pub fn set_current_dump_info(&self, dump_info: DumpInfo) {
self.current_dump.lock().unwrap().replace(dump_info);
}
pub fn get_or_create_index<F, R>(&self, uid: &str, f: F) -> Result<R, ResponseError>
where
F: FnOnce(&Index) -> Result<R, ResponseError>,
{
let mut index_has_been_created = false;
let index = match self.db.open_index(&uid) {
Some(index) => index,
None => {
index_has_been_created = true;
self.create_index(&uid)?
}
};
match f(&index) {
Ok(r) => Ok(r),
Err(err) => {
if index_has_been_created {
let _ = self.db.delete_index(&uid);
}
Err(err)
}
}
}
}

View File

@ -0,0 +1,133 @@
use std::ops::Deref;
use std::sync::Arc;
use sha2::Digest;
use crate::index::{Checked, Settings};
use crate::index_controller::{
error::Result, DumpInfo, IndexController, IndexMetadata, IndexSettings, IndexStats, Stats,
};
use crate::option::Opt;
pub mod search;
mod updates;
#[derive(Clone)]
pub struct Data {
inner: Arc<DataInner>,
}
impl Deref for Data {
type Target = DataInner;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
pub struct DataInner {
pub index_controller: IndexController,
pub api_keys: ApiKeys,
options: Opt,
}
#[derive(Clone)]
pub struct ApiKeys {
pub public: Option<String>,
pub private: Option<String>,
pub master: Option<String>,
}
impl ApiKeys {
pub fn generate_missing_api_keys(&mut self) {
if let Some(master_key) = &self.master {
if self.private.is_none() {
let key = format!("{}-private", master_key);
let sha = sha2::Sha256::digest(key.as_bytes());
self.private = Some(format!("{:x}", sha));
}
if self.public.is_none() {
let key = format!("{}-public", master_key);
let sha = sha2::Sha256::digest(key.as_bytes());
self.public = Some(format!("{:x}", sha));
}
}
}
}
impl Data {
pub fn new(options: Opt) -> anyhow::Result<Data> {
let path = options.db_path.clone();
let index_controller = IndexController::new(&path, &options)?;
let mut api_keys = ApiKeys {
master: options.clone().master_key,
private: None,
public: None,
};
api_keys.generate_missing_api_keys();
let inner = DataInner {
index_controller,
api_keys,
options,
};
let inner = Arc::new(inner);
Ok(Data { inner })
}
pub async fn settings(&self, uid: String) -> Result<Settings<Checked>> {
self.index_controller.settings(uid).await
}
pub async fn list_indexes(&self) -> Result<Vec<IndexMetadata>> {
self.index_controller.list_indexes().await
}
pub async fn index(&self, uid: String) -> Result<IndexMetadata> {
self.index_controller.get_index(uid).await
}
pub async fn create_index(
&self,
uid: String,
primary_key: Option<String>,
) -> Result<IndexMetadata> {
let settings = IndexSettings {
uid: Some(uid),
primary_key,
};
let meta = self.index_controller.create_index(settings).await?;
Ok(meta)
}
pub async fn get_index_stats(&self, uid: String) -> Result<IndexStats> {
Ok(self.index_controller.get_index_stats(uid).await?)
}
pub async fn get_all_stats(&self) -> Result<Stats> {
Ok(self.index_controller.get_all_stats().await?)
}
pub async fn create_dump(&self) -> Result<DumpInfo> {
Ok(self.index_controller.create_dump().await?)
}
pub async fn dump_status(&self, uid: String) -> Result<DumpInfo> {
Ok(self.index_controller.dump_info(uid).await?)
}
#[inline]
pub fn http_payload_size_limit(&self) -> usize {
self.options.http_payload_size_limit.get_bytes() as usize
}
#[inline]
pub fn api_keys(&self) -> &ApiKeys {
&self.api_keys
}
}

View File

@ -0,0 +1,34 @@
use serde_json::{Map, Value};
use super::Data;
use crate::index::{SearchQuery, SearchResult};
use crate::index_controller::error::Result;
impl Data {
pub async fn search(&self, index: String, search_query: SearchQuery) -> Result<SearchResult> {
self.index_controller.search(index, search_query).await
}
pub async fn retrieve_documents(
&self,
index: String,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<String>>,
) -> Result<Vec<Map<String, Value>>> {
self.index_controller
.documents(index, offset, limit, attributes_to_retrieve)
.await
}
pub async fn retrieve_document(
&self,
index: String,
document_id: String,
attributes_to_retrieve: Option<Vec<String>>,
) -> Result<Map<String, Value>> {
self.index_controller
.document(index, document_id, attributes_to_retrieve)
.await
}
}

View File

@ -0,0 +1,80 @@
use milli::update::{IndexDocumentsMethod, UpdateFormat};
use crate::extractors::payload::Payload;
use crate::index::{Checked, Settings};
use crate::index_controller::{error::Result, IndexMetadata, IndexSettings, UpdateStatus};
use crate::Data;
impl Data {
pub async fn add_documents(
&self,
index: String,
method: IndexDocumentsMethod,
format: UpdateFormat,
stream: Payload,
primary_key: Option<String>,
) -> Result<UpdateStatus> {
let update_status = self
.index_controller
.add_documents(index, method, format, stream, primary_key)
.await?;
Ok(update_status)
}
pub async fn update_settings(
&self,
index: String,
settings: Settings<Checked>,
create: bool,
) -> Result<UpdateStatus> {
let update = self
.index_controller
.update_settings(index, settings, create)
.await?;
Ok(update)
}
pub async fn clear_documents(&self, index: String) -> Result<UpdateStatus> {
let update = self.index_controller.clear_documents(index).await?;
Ok(update)
}
pub async fn delete_documents(
&self,
index: String,
document_ids: Vec<String>,
) -> Result<UpdateStatus> {
let update = self
.index_controller
.delete_documents(index, document_ids)
.await?;
Ok(update)
}
pub async fn delete_index(&self, index: String) -> Result<()> {
self.index_controller.delete_index(index).await?;
Ok(())
}
pub async fn get_update_status(&self, index: String, uid: u64) -> Result<UpdateStatus> {
self.index_controller.update_status(index, uid).await
}
pub async fn get_updates_status(&self, index: String) -> Result<Vec<UpdateStatus>> {
self.index_controller.all_update_status(index).await
}
pub async fn update_index(
&self,
uid: String,
primary_key: Option<String>,
new_uid: Option<String>,
) -> Result<IndexMetadata> {
let settings = IndexSettings {
uid: new_uid,
primary_key,
};
self.index_controller.update_index(uid, settings).await
}
}

View File

@ -1,412 +0,0 @@
use std::fs::{create_dir_all, File};
use std::io::prelude::*;
use std::path::{Path, PathBuf};
use std::thread;
use actix_web::web;
use chrono::offset::Utc;
use indexmap::IndexMap;
use log::{error, info};
use meilisearch_core::{MainWriter, MainReader, UpdateReader};
use meilisearch_core::settings::Settings;
use meilisearch_core::update::{apply_settings_update, apply_documents_addition};
use serde::{Deserialize, Serialize};
use serde_json::json;
use tempfile::TempDir;
use crate::Data;
use crate::error::{Error, ResponseError};
use crate::helpers::compression;
use crate::routes::index;
use crate::routes::index::IndexResponse;
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
enum DumpVersion {
V1,
}
impl DumpVersion {
const CURRENT: Self = Self::V1;
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DumpMetadata {
indexes: Vec<crate::routes::index::IndexResponse>,
db_version: String,
dump_version: DumpVersion,
}
impl DumpMetadata {
/// Create a DumpMetadata with the current dump version of meilisearch.
pub fn new(indexes: Vec<crate::routes::index::IndexResponse>, db_version: String) -> Self {
DumpMetadata {
indexes,
db_version,
dump_version: DumpVersion::CURRENT,
}
}
/// Extract DumpMetadata from `metadata.json` file present at provided `dir_path`
fn from_path(dir_path: &Path) -> Result<Self, Error> {
let path = dir_path.join("metadata.json");
let file = File::open(path)?;
let reader = std::io::BufReader::new(file);
let metadata = serde_json::from_reader(reader)?;
Ok(metadata)
}
/// Write DumpMetadata in `metadata.json` file at provided `dir_path`
fn to_path(&self, dir_path: &Path) -> Result<(), Error> {
let path = dir_path.join("metadata.json");
let file = File::create(path)?;
serde_json::to_writer(file, &self)?;
Ok(())
}
}
/// Extract Settings from `settings.json` file present at provided `dir_path`
fn settings_from_path(dir_path: &Path) -> Result<Settings, Error> {
let path = dir_path.join("settings.json");
let file = File::open(path)?;
let reader = std::io::BufReader::new(file);
let metadata = serde_json::from_reader(reader)?;
Ok(metadata)
}
/// Write Settings in `settings.json` file at provided `dir_path`
fn settings_to_path(settings: &Settings, dir_path: &Path) -> Result<(), Error> {
let path = dir_path.join("settings.json");
let file = File::create(path)?;
serde_json::to_writer(file, settings)?;
Ok(())
}
/// Import settings and documents of a dump with version `DumpVersion::V1` in specified index.
fn import_index_v1(
data: &Data,
dumps_dir: &Path,
index_uid: &str,
document_batch_size: usize,
write_txn: &mut MainWriter,
) -> Result<(), Error> {
// open index
let index = data
.db
.open_index(index_uid)
.ok_or(Error::index_not_found(index_uid))?;
// index dir path in dump dir
let index_path = &dumps_dir.join(index_uid);
// extract `settings.json` file and import content
let settings = settings_from_path(&index_path)?;
let settings = settings.to_update().map_err(|e| Error::dump_failed(format!("importing settings for index {}; {}", index_uid, e)))?;
apply_settings_update(write_txn, &index, settings)?;
// create iterator over documents in `documents.jsonl` to make batch importation
// create iterator over documents in `documents.jsonl` to make batch importation
let documents = {
let file = File::open(&index_path.join("documents.jsonl"))?;
let reader = std::io::BufReader::new(file);
let deserializer = serde_json::Deserializer::from_reader(reader);
deserializer.into_iter::<IndexMap<String, serde_json::Value>>()
};
// batch import document every `document_batch_size`:
// create a Vec to bufferize documents
let mut values = Vec::with_capacity(document_batch_size);
// iterate over documents
for document in documents {
// push document in buffer
values.push(document?);
// if buffer is full, create and apply a batch, and clean buffer
if values.len() == document_batch_size {
let batch = std::mem::replace(&mut values, Vec::with_capacity(document_batch_size));
apply_documents_addition(write_txn, &index, batch, None)?;
}
}
// apply documents remaining in the buffer
if !values.is_empty() {
apply_documents_addition(write_txn, &index, values, None)?;
}
// sync index information: stats, updated_at, last_update
if let Err(e) = crate::index_update_callback_txn(index, index_uid, data, write_txn) {
return Err(Error::Internal(e));
}
Ok(())
}
/// Import dump from `dump_path` in database.
pub fn import_dump(
data: &Data,
dump_path: &Path,
document_batch_size: usize,
) -> Result<(), Error> {
info!("Importing dump from {:?}...", dump_path);
// create a temporary directory
let tmp_dir = TempDir::new()?;
let tmp_dir_path = tmp_dir.path();
// extract dump in temporary directory
compression::from_tar_gz(dump_path, tmp_dir_path)?;
// read dump metadata
let metadata = DumpMetadata::from_path(&tmp_dir_path)?;
// choose importation function from DumpVersion of metadata
let import_index = match metadata.dump_version {
DumpVersion::V1 => import_index_v1,
};
// remove indexes which have same `uid` than indexes to import and create empty indexes
let existing_index_uids = data.db.indexes_uids();
for index in metadata.indexes.iter() {
if existing_index_uids.contains(&index.uid) {
data.db.delete_index(index.uid.clone())?;
}
index::create_index_sync(&data.db, index.uid.clone(), index.name.clone(), index.primary_key.clone())?;
}
// import each indexes content
data.db.main_write::<_, _, Error>(|mut writer| {
for index in metadata.indexes {
import_index(&data, tmp_dir_path, &index.uid, document_batch_size, &mut writer)?;
}
Ok(())
})?;
info!("Dump importation from {:?} succeed", dump_path);
Ok(())
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
#[serde(rename_all = "snake_case")]
pub enum DumpStatus {
Done,
InProgress,
Failed,
}
#[derive(Debug, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct DumpInfo {
pub uid: String,
pub status: DumpStatus,
#[serde(skip_serializing_if = "Option::is_none", flatten)]
pub error: Option<serde_json::Value>,
}
impl DumpInfo {
pub fn new(uid: String, status: DumpStatus) -> Self {
Self { uid, status, error: None }
}
pub fn with_error(mut self, error: ResponseError) -> Self {
self.status = DumpStatus::Failed;
self.error = Some(json!(error));
self
}
pub fn dump_already_in_progress(&self) -> bool {
self.status == DumpStatus::InProgress
}
}
/// Generate uid from creation date
fn generate_uid() -> String {
Utc::now().format("%Y%m%d-%H%M%S%3f").to_string()
}
/// Infer dumps_dir from dump_uid
pub fn compressed_dumps_dir(dumps_dir: &Path, dump_uid: &str) -> PathBuf {
dumps_dir.join(format!("{}.dump", dump_uid))
}
/// Write metadata in dump
fn dump_metadata(data: &web::Data<Data>, dir_path: &Path, indexes: Vec<IndexResponse>) -> Result<(), Error> {
let (db_major, db_minor, db_patch) = data.db.version();
let metadata = DumpMetadata::new(indexes, format!("{}.{}.{}", db_major, db_minor, db_patch));
metadata.to_path(dir_path)
}
/// Export settings of provided index in dump
fn dump_index_settings(data: &web::Data<Data>, reader: &MainReader, dir_path: &Path, index_uid: &str) -> Result<(), Error> {
let settings = crate::routes::setting::get_all_sync(data, reader, index_uid)?;
settings_to_path(&settings, dir_path)
}
/// Export updates of provided index in dump
fn dump_index_updates(data: &web::Data<Data>, reader: &UpdateReader, dir_path: &Path, index_uid: &str) -> Result<(), Error> {
let updates_path = dir_path.join("updates.jsonl");
let updates = crate::routes::index::get_all_updates_status_sync(data, reader, index_uid)?;
let file = File::create(updates_path)?;
for update in updates {
serde_json::to_writer(&file, &update)?;
writeln!(&file)?;
}
Ok(())
}
/// Export documents of provided index in dump
fn dump_index_documents(data: &web::Data<Data>, reader: &MainReader, dir_path: &Path, index_uid: &str) -> Result<(), Error> {
let documents_path = dir_path.join("documents.jsonl");
let file = File::create(documents_path)?;
let dump_batch_size = data.dump_batch_size;
let mut offset = 0;
loop {
let documents = crate::routes::document::get_all_documents_sync(data, reader, index_uid, offset, dump_batch_size, None)?;
if documents.is_empty() { break; } else { offset += dump_batch_size; }
for document in documents {
serde_json::to_writer(&file, &document)?;
writeln!(&file)?;
}
}
Ok(())
}
/// Write error with a context.
fn fail_dump_process<E: std::error::Error>(data: &web::Data<Data>, dump_info: DumpInfo, context: &str, error: E) {
let error_message = format!("{}; {}", context, error);
error!("Something went wrong during dump process: {}", &error_message);
data.set_current_dump_info(dump_info.with_error(Error::dump_failed(error_message).into()))
}
/// Main function of dump.
fn dump_process(data: web::Data<Data>, dumps_dir: PathBuf, dump_info: DumpInfo) {
// open read transaction on Update
let update_reader = match data.db.update_read_txn() {
Ok(r) => r,
Err(e) => {
fail_dump_process(&data, dump_info, "creating RO transaction on updates", e);
return ;
}
};
// open read transaction on Main
let main_reader = match data.db.main_read_txn() {
Ok(r) => r,
Err(e) => {
fail_dump_process(&data, dump_info, "creating RO transaction on main", e);
return ;
}
};
// create a temporary directory
let tmp_dir = match TempDir::new() {
Ok(tmp_dir) => tmp_dir,
Err(e) => {
fail_dump_process(&data, dump_info, "creating temporary directory", e);
return ;
}
};
let tmp_dir_path = tmp_dir.path();
// fetch indexes
let indexes = match crate::routes::index::list_indexes_sync(&data, &main_reader) {
Ok(indexes) => indexes,
Err(e) => {
fail_dump_process(&data, dump_info, "listing indexes", e);
return ;
}
};
// create metadata
if let Err(e) = dump_metadata(&data, &tmp_dir_path, indexes.clone()) {
fail_dump_process(&data, dump_info, "generating metadata", e);
return ;
}
// export settings, updates and documents for each indexes
for index in indexes {
let index_path = tmp_dir_path.join(&index.uid);
// create index sub-dircetory
if let Err(e) = create_dir_all(&index_path) {
fail_dump_process(&data, dump_info, &format!("creating directory for index {}", &index.uid), e);
return ;
}
// export settings
if let Err(e) = dump_index_settings(&data, &main_reader, &index_path, &index.uid) {
fail_dump_process(&data, dump_info, &format!("generating settings for index {}", &index.uid), e);
return ;
}
// export documents
if let Err(e) = dump_index_documents(&data, &main_reader, &index_path, &index.uid) {
fail_dump_process(&data, dump_info, &format!("generating documents for index {}", &index.uid), e);
return ;
}
// export updates
if let Err(e) = dump_index_updates(&data, &update_reader, &index_path, &index.uid) {
fail_dump_process(&data, dump_info, &format!("generating updates for index {}", &index.uid), e);
return ;
}
}
// compress dump in a file named `{dump_uid}.dump` in `dumps_dir`
if let Err(e) = crate::helpers::compression::to_tar_gz(&tmp_dir_path, &compressed_dumps_dir(&dumps_dir, &dump_info.uid)) {
fail_dump_process(&data, dump_info, "compressing dump", e);
return ;
}
// update dump info to `done`
let resume = DumpInfo::new(
dump_info.uid,
DumpStatus::Done
);
data.set_current_dump_info(resume);
}
pub fn init_dump_process(data: &web::Data<Data>, dumps_dir: &Path) -> Result<DumpInfo, Error> {
create_dir_all(dumps_dir).map_err(|e| Error::dump_failed(format!("creating temporary directory {}", e)))?;
// check if a dump is already in progress
if let Some(resume) = data.get_current_dump_info() {
if resume.dump_already_in_progress() {
return Err(Error::dump_conflict())
}
}
// generate a new dump info
let info = DumpInfo::new(
generate_uid(),
DumpStatus::InProgress
);
data.set_current_dump_info(info.clone());
let data = data.clone();
let dumps_dir = dumps_dir.to_path_buf();
let info_cloned = info.clone();
// run dump process in a new thread
thread::spawn(move ||
dump_process(data, dumps_dir, info_cloned)
);
Ok(info)
}

View File

@ -1,307 +1,167 @@
use std::error; use std::error::Error;
use std::fmt; use std::fmt;
use actix_http::ResponseBuilder;
use actix_web as aweb; use actix_web as aweb;
use actix_web::error::{JsonPayloadError, QueryPayloadError}; use actix_web::body::Body;
use actix_web::dev::BaseHttpResponseBuilder;
use actix_web::http::StatusCode; use actix_web::http::StatusCode;
use serde::ser::{Serialize, Serializer, SerializeStruct}; use aweb::error::{JsonPayloadError, QueryPayloadError};
use meilisearch_error::{Code, ErrorCode};
use milli::UserError;
use serde::{Deserialize, Serialize};
use meilisearch_error::{ErrorCode, Code}; #[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
#[derive(Debug)]
pub struct ResponseError { pub struct ResponseError {
inner: Box<dyn ErrorCode>, #[serde(skip)]
} code: StatusCode,
message: String,
impl error::Error for ResponseError {} error_code: String,
error_type: String,
impl ErrorCode for ResponseError { error_link: String,
fn error_code(&self) -> Code {
self.inner.error_code()
}
} }
impl fmt::Display for ResponseError { impl fmt::Display for ResponseError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.inner.fmt(f) self.message.fmt(f)
} }
} }
impl From<Error> for ResponseError { impl<T> From<T> for ResponseError
fn from(error: Error) -> ResponseError { where
ResponseError { inner: Box::new(error) } T: ErrorCode,
} {
} fn from(other: T) -> Self {
Self {
impl From<meilisearch_core::Error> for ResponseError { code: other.http_status(),
fn from(err: meilisearch_core::Error) -> ResponseError { message: other.to_string(),
ResponseError { inner: Box::new(err) } error_code: other.error_name(),
} error_type: other.error_type(),
} error_link: other.error_url(),
}
impl From<meilisearch_schema::Error> for ResponseError {
fn from(err: meilisearch_schema::Error) -> ResponseError {
ResponseError { inner: Box::new(err) }
}
}
impl From<FacetCountError> for ResponseError {
fn from(err: FacetCountError) -> ResponseError {
ResponseError { inner: Box::new(err) }
}
}
impl Serialize for ResponseError {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let struct_name = "ResponseError";
let field_count = 4;
let mut state = serializer.serialize_struct(struct_name, field_count)?;
state.serialize_field("message", &self.to_string())?;
state.serialize_field("errorCode", &self.error_name())?;
state.serialize_field("errorType", &self.error_type())?;
state.serialize_field("errorLink", &self.error_url())?;
state.end()
} }
} }
impl aweb::error::ResponseError for ResponseError { impl aweb::error::ResponseError for ResponseError {
fn error_response(&self) -> aweb::HttpResponse { fn error_response(&self) -> aweb::BaseHttpResponse<Body> {
ResponseBuilder::new(self.status_code()).json(&self) let json = serde_json::to_vec(self).unwrap();
BaseHttpResponseBuilder::new(self.status_code())
.content_type("application/json")
.body(json)
} }
fn status_code(&self) -> StatusCode { fn status_code(&self) -> StatusCode {
self.http_status() self.code
}
}
macro_rules! internal_error {
($target:ty : $($other:path), *) => {
$(
impl From<$other> for $target {
fn from(other: $other) -> Self {
Self::Internal(Box::new(other))
}
}
)*
} }
} }
#[derive(Debug)] #[derive(Debug)]
pub enum Error { pub struct MilliError<'a>(pub &'a milli::Error);
BadParameter(String, String),
BadRequest(String), impl Error for MilliError<'_> {}
CreateIndex(String),
DocumentNotFound(String), impl fmt::Display for MilliError<'_> {
IndexNotFound(String), fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
IndexAlreadyExists(String), self.0.fmt(f)
Internal(String), }
InvalidIndexUid,
InvalidToken(String),
MissingAuthorizationHeader,
NotFound(String),
OpenIndex(String),
RetrieveDocument(u32, String),
SearchDocuments(String),
PayloadTooLarge,
UnsupportedMediaType,
DumpAlreadyInProgress,
DumpProcessFailed(String),
} }
impl error::Error for Error {} impl ErrorCode for MilliError<'_> {
impl ErrorCode for Error {
fn error_code(&self) -> Code { fn error_code(&self) -> Code {
use Error::*; match self.0 {
milli::Error::InternalError(_) => Code::Internal,
milli::Error::IoError(_) => Code::Internal,
milli::Error::UserError(ref error) => {
match error {
// TODO: wait for spec for new error codes.
UserError::Csv(_)
| UserError::SerdeJson(_)
| UserError::MaxDatabaseSizeReached
| UserError::InvalidCriterionName { .. }
| UserError::InvalidDocumentId { .. }
| UserError::InvalidStoreFile
| UserError::NoSpaceLeftOnDevice
| UserError::DocumentLimitReached => Code::Internal,
UserError::AttributeLimitReached => Code::MaxFieldsLimitExceeded,
UserError::InvalidFilter(_) => Code::Filter,
UserError::InvalidFilterAttribute(_) => Code::Filter,
UserError::MissingDocumentId { .. } => Code::MissingDocumentId,
UserError::MissingPrimaryKey => Code::MissingPrimaryKey,
UserError::PrimaryKeyCannotBeChanged => Code::PrimaryKeyAlreadyPresent,
UserError::PrimaryKeyCannotBeReset => Code::PrimaryKeyAlreadyPresent,
UserError::UnknownInternalDocumentId { .. } => Code::DocumentNotFound,
UserError::InvalidFacetsDistribution { .. } => Code::BadRequest,
}
}
}
}
}
impl fmt::Display for PayloadError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self { match self {
BadParameter(_, _) => Code::BadParameter, PayloadError::Json(e) => e.fmt(f),
BadRequest(_) => Code::BadRequest, PayloadError::Query(e) => e.fmt(f),
CreateIndex(_) => Code::CreateIndex,
DocumentNotFound(_) => Code::DocumentNotFound,
IndexNotFound(_) => Code::IndexNotFound,
IndexAlreadyExists(_) => Code::IndexAlreadyExists,
Internal(_) => Code::Internal,
InvalidIndexUid => Code::InvalidIndexUid,
InvalidToken(_) => Code::InvalidToken,
MissingAuthorizationHeader => Code::MissingAuthorizationHeader,
NotFound(_) => Code::NotFound,
OpenIndex(_) => Code::OpenIndex,
RetrieveDocument(_, _) => Code::RetrieveDocument,
SearchDocuments(_) => Code::SearchDocuments,
PayloadTooLarge => Code::PayloadTooLarge,
UnsupportedMediaType => Code::UnsupportedMediaType,
DumpAlreadyInProgress => Code::DumpAlreadyInProgress,
DumpProcessFailed(_) => Code::DumpProcessFailed,
} }
} }
} }
#[derive(Debug)] #[derive(Debug)]
pub enum FacetCountError { pub enum PayloadError {
AttributeNotSet(String), Json(JsonPayloadError),
SyntaxError(String), Query(QueryPayloadError),
UnexpectedToken { found: String, expected: &'static [&'static str] },
NoFacetSet,
} }
impl error::Error for FacetCountError {} impl Error for PayloadError {}
impl ErrorCode for FacetCountError { impl ErrorCode for PayloadError {
fn error_code(&self) -> Code { fn error_code(&self) -> Code {
Code::BadRequest
}
}
impl FacetCountError {
pub fn unexpected_token(found: impl ToString, expected: &'static [&'static str]) -> FacetCountError {
let found = found.to_string();
FacetCountError::UnexpectedToken { expected, found }
}
}
impl From<serde_json::error::Error> for FacetCountError {
fn from(other: serde_json::error::Error) -> FacetCountError {
FacetCountError::SyntaxError(other.to_string())
}
}
impl fmt::Display for FacetCountError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use FacetCountError::*;
match self { match self {
AttributeNotSet(attr) => write!(f, "Attribute {} is not set as facet", attr), PayloadError::Json(err) => match err {
SyntaxError(msg) => write!(f, "Syntax error: {}", msg), JsonPayloadError::Overflow => Code::PayloadTooLarge,
UnexpectedToken { expected, found } => write!(f, "Unexpected {} found, expected {:?}", found, expected), JsonPayloadError::ContentType => Code::UnsupportedMediaType,
NoFacetSet => write!(f, "Can't perform facet count, as no facet is set"), JsonPayloadError::Payload(aweb::error::PayloadError::Overflow) => {
Code::PayloadTooLarge
}
JsonPayloadError::Deserialize(_) | JsonPayloadError::Payload(_) => Code::BadRequest,
JsonPayloadError::Serialize(_) => Code::Internal,
_ => Code::Internal,
},
PayloadError::Query(err) => match err {
QueryPayloadError::Deserialize(_) => Code::BadRequest,
_ => Code::Internal,
},
} }
} }
} }
impl Error { impl From<JsonPayloadError> for PayloadError {
pub fn internal(err: impl fmt::Display) -> Error { fn from(other: JsonPayloadError) -> Self {
Error::Internal(err.to_string()) Self::Json(other)
}
pub fn bad_request(err: impl fmt::Display) -> Error {
Error::BadRequest(err.to_string())
}
pub fn missing_authorization_header() -> Error {
Error::MissingAuthorizationHeader
}
pub fn invalid_token(err: impl fmt::Display) -> Error {
Error::InvalidToken(err.to_string())
}
pub fn not_found(err: impl fmt::Display) -> Error {
Error::NotFound(err.to_string())
}
pub fn index_not_found(err: impl fmt::Display) -> Error {
Error::IndexNotFound(err.to_string())
}
pub fn document_not_found(err: impl fmt::Display) -> Error {
Error::DocumentNotFound(err.to_string())
}
pub fn bad_parameter(param: impl fmt::Display, err: impl fmt::Display) -> Error {
Error::BadParameter(param.to_string(), err.to_string())
}
pub fn open_index(err: impl fmt::Display) -> Error {
Error::OpenIndex(err.to_string())
}
pub fn create_index(err: impl fmt::Display) -> Error {
Error::CreateIndex(err.to_string())
}
pub fn invalid_index_uid() -> Error {
Error::InvalidIndexUid
}
pub fn retrieve_document(doc_id: u32, err: impl fmt::Display) -> Error {
Error::RetrieveDocument(doc_id, err.to_string())
}
pub fn search_documents(err: impl fmt::Display) -> Error {
Error::SearchDocuments(err.to_string())
}
pub fn dump_conflict() -> Error {
Error::DumpAlreadyInProgress
}
pub fn dump_failed(message: String) -> Error {
Error::DumpProcessFailed(message)
} }
} }
impl fmt::Display for Error { impl From<QueryPayloadError> for PayloadError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn from(other: QueryPayloadError) -> Self {
match self { Self::Query(other)
Self::BadParameter(param, err) => write!(f, "Url parameter {} error: {}", param, err),
Self::BadRequest(err) => f.write_str(err),
Self::CreateIndex(err) => write!(f, "Impossible to create index; {}", err),
Self::DocumentNotFound(document_id) => write!(f, "Document with id {} not found", document_id),
Self::IndexNotFound(index_uid) => write!(f, "Index {} not found", index_uid),
Self::IndexAlreadyExists(index_uid) => write!(f, "Index {} already exists", index_uid),
Self::Internal(err) => f.write_str(err),
Self::InvalidIndexUid => f.write_str("Index must have a valid uid; Index uid can be of type integer or string only composed of alphanumeric characters, hyphens (-) and underscores (_)."),
Self::InvalidToken(err) => write!(f, "Invalid API key: {}", err),
Self::MissingAuthorizationHeader => f.write_str("You must have an authorization token"),
Self::NotFound(err) => write!(f, "{} not found", err),
Self::OpenIndex(err) => write!(f, "Impossible to open index; {}", err),
Self::RetrieveDocument(id, err) => write!(f, "Impossible to retrieve the document with id: {}; {}", id, err),
Self::SearchDocuments(err) => write!(f, "Impossible to search documents; {}", err),
Self::PayloadTooLarge => f.write_str("Payload too large"),
Self::UnsupportedMediaType => f.write_str("Unsupported media type"),
Self::DumpAlreadyInProgress => f.write_str("Another dump is already in progress"),
Self::DumpProcessFailed(message) => write!(f, "Dump process failed: {}", message),
}
} }
} }
impl From<std::io::Error> for Error { pub fn payload_error_handler<E>(err: E) -> ResponseError
fn from(err: std::io::Error) -> Error { where
Error::Internal(err.to_string()) E: Into<PayloadError>,
} {
} err.into().into()
impl From<actix_http::Error> for Error {
fn from(err: actix_http::Error) -> Error {
Error::Internal(err.to_string())
}
}
impl From<meilisearch_core::Error> for Error {
fn from(err: meilisearch_core::Error) -> Error {
Error::Internal(err.to_string())
}
}
impl From<serde_json::error::Error> for Error {
fn from(err: serde_json::error::Error) -> Error {
Error::Internal(err.to_string())
}
}
impl From<JsonPayloadError> for Error {
fn from(err: JsonPayloadError) -> Error {
match err {
JsonPayloadError::Deserialize(err) => Error::BadRequest(format!("Invalid JSON: {}", err)),
JsonPayloadError::Overflow => Error::PayloadTooLarge,
JsonPayloadError::ContentType => Error::UnsupportedMediaType,
JsonPayloadError::Payload(err) => Error::BadRequest(format!("Problem while decoding the request: {}", err)),
}
}
}
impl From<QueryPayloadError> for Error {
fn from(err: QueryPayloadError) -> Error {
match err {
QueryPayloadError::Deserialize(err) => Error::BadRequest(format!("Invalid query parameters: {}", err)),
}
}
}
pub fn payload_error_handler<E: Into<Error>>(err: E) -> ResponseError {
let error: Error = err.into();
error.into()
} }

View File

@ -0,0 +1,25 @@
use meilisearch_error::{Code, ErrorCode};
#[derive(Debug, thiserror::Error)]
pub enum AuthenticationError {
#[error("You must have an authorization token")]
MissingAuthorizationHeader,
#[error("Invalid API key")]
InvalidToken(String),
// Triggered on configuration error.
#[error("Irretrievable state")]
IrretrievableState,
#[error("Unknown authentication policy")]
UnknownPolicy,
}
impl ErrorCode for AuthenticationError {
fn error_code(&self) -> Code {
match self {
AuthenticationError::MissingAuthorizationHeader => Code::MissingAuthorizationHeader,
AuthenticationError::InvalidToken(_) => Code::InvalidToken,
AuthenticationError::IrretrievableState => Code::Internal,
AuthenticationError::UnknownPolicy => Code::Internal,
}
}
}

View File

@ -0,0 +1,182 @@
mod error;
use std::any::{Any, TypeId};
use std::collections::HashMap;
use std::marker::PhantomData;
use std::ops::Deref;
use actix_web::FromRequest;
use futures::future::err;
use futures::future::{ok, Ready};
use crate::error::ResponseError;
use error::AuthenticationError;
macro_rules! create_policies {
($($name:ident), *) => {
pub mod policies {
use std::collections::HashSet;
use crate::extractors::authentication::Policy;
$(
#[derive(Debug, Default)]
pub struct $name {
inner: HashSet<Vec<u8>>
}
impl $name {
pub fn new() -> Self {
Self { inner: HashSet::new() }
}
pub fn add(&mut self, token: Vec<u8>) {
self.inner.insert(token);
}
}
impl Policy for $name {
fn authenticate(&self, token: &[u8]) -> bool {
self.inner.contains(token)
}
}
)*
}
};
}
create_policies!(Public, Private, Admin);
/// Instanciate a `Policies`, filled with the given policies.
macro_rules! init_policies {
($($name:ident), *) => {
{
let mut policies = crate::extractors::authentication::Policies::new();
$(
let policy = $name::new();
policies.insert(policy);
)*
policies
}
};
}
/// Adds user to all specified policies.
macro_rules! create_users {
($policies:ident, $($user:expr => { $($policy:ty), * }), *) => {
{
$(
$(
$policies.get_mut::<$policy>().map(|p| p.add($user.to_owned()));
)*
)*
}
};
}
pub struct GuardedData<T, D> {
data: D,
_marker: PhantomData<T>,
}
impl<T, D> Deref for GuardedData<T, D> {
type Target = D;
fn deref(&self) -> &Self::Target {
&self.data
}
}
pub trait Policy {
fn authenticate(&self, token: &[u8]) -> bool;
}
#[derive(Debug)]
pub struct Policies {
inner: HashMap<TypeId, Box<dyn Any>>,
}
impl Policies {
pub fn new() -> Self {
Self {
inner: HashMap::new(),
}
}
pub fn insert<S: Policy + 'static>(&mut self, policy: S) {
self.inner.insert(TypeId::of::<S>(), Box::new(policy));
}
pub fn get<S: Policy + 'static>(&self) -> Option<&S> {
self.inner
.get(&TypeId::of::<S>())
.and_then(|p| p.downcast_ref::<S>())
}
pub fn get_mut<S: Policy + 'static>(&mut self) -> Option<&mut S> {
self.inner
.get_mut(&TypeId::of::<S>())
.and_then(|p| p.downcast_mut::<S>())
}
}
impl Default for Policies {
fn default() -> Self {
Self::new()
}
}
pub enum AuthConfig {
NoAuth,
Auth(Policies),
}
impl Default for AuthConfig {
fn default() -> Self {
Self::NoAuth
}
}
impl<P: Policy + 'static, D: 'static + Clone> FromRequest for GuardedData<P, D> {
type Config = AuthConfig;
type Error = ResponseError;
type Future = Ready<Result<Self, Self::Error>>;
fn from_request(
req: &actix_web::HttpRequest,
_payload: &mut actix_http::Payload,
) -> Self::Future {
match req.app_data::<Self::Config>() {
Some(config) => match config {
AuthConfig::NoAuth => match req.app_data::<D>().cloned() {
Some(data) => ok(Self {
data,
_marker: PhantomData,
}),
None => err(AuthenticationError::IrretrievableState.into()),
},
AuthConfig::Auth(policies) => match policies.get::<P>() {
Some(policy) => match req.headers().get("x-meili-api-key") {
Some(token) => {
if policy.authenticate(token.as_bytes()) {
match req.app_data::<D>().cloned() {
Some(data) => ok(Self {
data,
_marker: PhantomData,
}),
None => err(AuthenticationError::IrretrievableState.into()),
}
} else {
err(AuthenticationError::InvalidToken(String::from("hello")).into())
}
}
None => err(AuthenticationError::MissingAuthorizationHeader.into()),
},
None => err(AuthenticationError::UnknownPolicy.into()),
},
},
None => err(AuthenticationError::IrretrievableState.into()),
}
}
}

View File

@ -0,0 +1,3 @@
pub mod payload;
#[macro_use]
pub mod authentication;

View File

@ -0,0 +1,69 @@
use std::pin::Pin;
use std::task::{Context, Poll};
use actix_http::error::PayloadError;
use actix_web::{dev, web, FromRequest, HttpRequest};
use futures::future::{ready, Ready};
use futures::Stream;
pub struct Payload {
payload: dev::Payload,
limit: usize,
}
pub struct PayloadConfig {
limit: usize,
}
impl PayloadConfig {
pub fn new(limit: usize) -> Self {
Self { limit }
}
}
impl Default for PayloadConfig {
fn default() -> Self {
Self { limit: 256 * 1024 }
}
}
impl FromRequest for Payload {
type Config = PayloadConfig;
type Error = PayloadError;
type Future = Ready<Result<Payload, Self::Error>>;
#[inline]
fn from_request(req: &HttpRequest, payload: &mut dev::Payload) -> Self::Future {
let limit = req
.app_data::<PayloadConfig>()
.map(|c| c.limit)
.unwrap_or(Self::Config::default().limit);
ready(Ok(Payload {
payload: payload.take(),
limit,
}))
}
}
impl Stream for Payload {
type Item = Result<web::Bytes, PayloadError>;
#[inline]
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match Pin::new(&mut self.payload).poll_next(cx) {
Poll::Ready(Some(result)) => match result {
Ok(bytes) => match self.limit.checked_sub(bytes.len()) {
Some(new_limit) => {
self.limit = new_limit;
Poll::Ready(Some(Ok(bytes)))
}
None => Poll::Ready(Some(Err(PayloadError::Overflow))),
},
x => Poll::Ready(Some(x)),
},
otherwise => otherwise,
}
}
}

View File

@ -1,107 +0,0 @@
use std::cell::RefCell;
use std::pin::Pin;
use std::rc::Rc;
use std::task::{Context, Poll};
use actix_service::{Service, Transform};
use actix_web::{dev::ServiceRequest, dev::ServiceResponse, web};
use futures::future::{err, ok, Future, Ready};
use actix_web::error::ResponseError as _;
use actix_web::dev::Body;
use crate::error::{Error, ResponseError};
use crate::Data;
#[derive(Clone)]
pub enum Authentication {
Public,
Private,
Admin,
}
impl<S: 'static> Transform<S> for Authentication
where
S: Service<Request = ServiceRequest, Response = ServiceResponse<Body>, Error = actix_web::Error>,
S::Future: 'static,
{
type Request = ServiceRequest;
type Response = ServiceResponse<Body>;
type Error = actix_web::Error;
type InitError = ();
type Transform = LoggingMiddleware<S>;
type Future = Ready<Result<Self::Transform, Self::InitError>>;
fn new_transform(&self, service: S) -> Self::Future {
ok(LoggingMiddleware {
acl: self.clone(),
service: Rc::new(RefCell::new(service)),
})
}
}
pub struct LoggingMiddleware<S> {
acl: Authentication,
service: Rc<RefCell<S>>,
}
#[allow(clippy::type_complexity)]
impl<S> Service for LoggingMiddleware<S>
where
S: Service<Request = ServiceRequest, Response = ServiceResponse<Body>, Error = actix_web::Error> + 'static,
S::Future: 'static,
{
type Request = ServiceRequest;
type Response = ServiceResponse<Body>;
type Error = actix_web::Error;
type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>>>>;
fn poll_ready(&mut self, cx: &mut Context) -> Poll<Result<(), Self::Error>> {
self.service.poll_ready(cx)
}
fn call(&mut self, req: ServiceRequest) -> Self::Future {
let mut svc = self.service.clone();
// This unwrap is left because this error should never appear. If that's the case, then
// it means that actix-web has an issue or someone changes the type `Data`.
let data = req.app_data::<web::Data<Data>>().unwrap();
if data.api_keys.master.is_none() {
return Box::pin(svc.call(req));
}
let auth_header = match req.headers().get("X-Meili-API-Key") {
Some(auth) => match auth.to_str() {
Ok(auth) => auth,
Err(_) => {
let error = ResponseError::from(Error::MissingAuthorizationHeader).error_response();
let (request, _) = req.into_parts();
return Box::pin(ok(ServiceResponse::new(request, error)))
}
},
None => {
return Box::pin(err(ResponseError::from(Error::MissingAuthorizationHeader).into()));
}
};
let authenticated = match self.acl {
Authentication::Admin => data.api_keys.master.as_deref() == Some(auth_header),
Authentication::Private => {
data.api_keys.master.as_deref() == Some(auth_header)
|| data.api_keys.private.as_deref() == Some(auth_header)
}
Authentication::Public => {
data.api_keys.master.as_deref() == Some(auth_header)
|| data.api_keys.private.as_deref() == Some(auth_header)
|| data.api_keys.public.as_deref() == Some(auth_header)
}
};
if authenticated {
Box::pin(svc.call(req))
} else {
let error = ResponseError::from(Error::InvalidToken(auth_header.to_string())).error_response();
let (request, _) = req.into_parts();
Box::pin(ok(ServiceResponse::new(request, error)))
}
}
}

View File

@ -1,35 +1,26 @@
use flate2::Compression; use std::fs::{create_dir_all, File};
use flate2::read::GzDecoder; use std::io::Write;
use flate2::write::GzEncoder;
use std::fs::{create_dir_all, rename, File};
use std::path::Path; use std::path::Path;
use tar::{Builder, Archive};
use uuid::Uuid;
use crate::error::Error; use flate2::{read::GzDecoder, write::GzEncoder, Compression};
use tar::{Archive, Builder};
pub fn to_tar_gz(src: &Path, dest: &Path) -> Result<(), Error> { pub fn to_tar_gz(src: impl AsRef<Path>, dest: impl AsRef<Path>) -> anyhow::Result<()> {
let file_name = format!(".{}", Uuid::new_v4().to_urn()); let mut f = File::create(dest)?;
let p = dest.with_file_name(file_name); let gz_encoder = GzEncoder::new(&mut f, Compression::default());
let tmp_dest = p.as_path();
let f = File::create(tmp_dest)?;
let gz_encoder = GzEncoder::new(f, Compression::default());
let mut tar_encoder = Builder::new(gz_encoder); let mut tar_encoder = Builder::new(gz_encoder);
tar_encoder.append_dir_all(".", src)?; tar_encoder.append_dir_all(".", src)?;
let gz_encoder = tar_encoder.into_inner()?; let gz_encoder = tar_encoder.into_inner()?;
gz_encoder.finish()?; gz_encoder.finish()?;
f.flush()?;
rename(tmp_dest, dest)?;
Ok(()) Ok(())
} }
pub fn from_tar_gz(src: &Path, dest: &Path) -> Result<(), Error> { pub fn from_tar_gz(src: impl AsRef<Path>, dest: impl AsRef<Path>) -> anyhow::Result<()> {
let f = File::open(src)?; let f = File::open(&src)?;
let gz = GzDecoder::new(f); let gz = GzDecoder::new(f);
let mut ar = Archive::new(gz); let mut ar = Archive::new(gz);
create_dir_all(dest)?; create_dir_all(&dest)?;
ar.unpack(dest)?; ar.unpack(&dest)?;
Ok(()) Ok(())
} }

View File

@ -0,0 +1,16 @@
use walkdir::WalkDir;
pub trait EnvSizer {
fn size(&self) -> u64;
}
impl EnvSizer for heed::Env {
fn size(&self) -> u64 {
WalkDir::new(self.path())
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.filter(|metadata| metadata.is_file())
.fold(0, |acc, m| acc + m.len())
}
}

View File

@ -1,649 +0,0 @@
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::time::Instant;
use indexmap::IndexMap;
use log::error;
use meilisearch_core::{Filter, MainReader};
use meilisearch_core::facets::FacetFilter;
use meilisearch_core::criterion::*;
use meilisearch_core::settings::RankingRule;
use meilisearch_core::{Highlight, Index, RankedMap};
use meilisearch_schema::{FieldId, Schema};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
use slice_group_by::GroupBy;
use crate::error::{Error, ResponseError};
pub trait IndexSearchExt {
fn new_search(&self, query: Option<String>) -> SearchBuilder;
}
impl IndexSearchExt for Index {
fn new_search(&self, query: Option<String>) -> SearchBuilder {
SearchBuilder {
index: self,
query,
offset: 0,
limit: 20,
attributes_to_crop: None,
attributes_to_retrieve: None,
attributes_to_highlight: None,
filters: None,
matches: false,
facet_filters: None,
facets: None,
}
}
}
pub struct SearchBuilder<'a> {
index: &'a Index,
query: Option<String>,
offset: usize,
limit: usize,
attributes_to_crop: Option<HashMap<String, usize>>,
attributes_to_retrieve: Option<HashSet<String>>,
attributes_to_highlight: Option<HashSet<String>>,
filters: Option<String>,
matches: bool,
facet_filters: Option<FacetFilter>,
facets: Option<Vec<(FieldId, String)>>
}
impl<'a> SearchBuilder<'a> {
pub fn offset(&mut self, value: usize) -> &SearchBuilder {
self.offset = value;
self
}
pub fn limit(&mut self, value: usize) -> &SearchBuilder {
self.limit = value;
self
}
pub fn attributes_to_crop(&mut self, value: HashMap<String, usize>) -> &SearchBuilder {
self.attributes_to_crop = Some(value);
self
}
pub fn attributes_to_retrieve(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_retrieve = Some(value);
self
}
pub fn add_retrievable_field(&mut self, value: String) -> &SearchBuilder {
let attributes_to_retrieve = self.attributes_to_retrieve.get_or_insert(HashSet::new());
attributes_to_retrieve.insert(value);
self
}
pub fn attributes_to_highlight(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_highlight = Some(value);
self
}
pub fn add_facet_filters(&mut self, filters: FacetFilter) -> &SearchBuilder {
self.facet_filters = Some(filters);
self
}
pub fn filters(&mut self, value: String) -> &SearchBuilder {
self.filters = Some(value);
self
}
pub fn get_matches(&mut self) -> &SearchBuilder {
self.matches = true;
self
}
pub fn add_facets(&mut self, facets: Vec<(FieldId, String)>) -> &SearchBuilder {
self.facets = Some(facets);
self
}
pub fn search(self, reader: &MainReader) -> Result<SearchResult, ResponseError> {
let schema = self
.index
.main
.schema(reader)?
.ok_or(Error::internal("missing schema"))?;
let ranked_map = self.index.main.ranked_map(reader)?.unwrap_or_default();
// Change criteria
let mut query_builder = match self.get_criteria(reader, &ranked_map, &schema)? {
Some(criteria) => self.index.query_builder_with_criteria(criteria),
None => self.index.query_builder(),
};
if let Some(filter_expression) = &self.filters {
let filter = Filter::parse(filter_expression, &schema)?;
let index = &self.index;
query_builder.with_filter(move |id| {
let reader = &reader;
let filter = &filter;
match filter.test(reader, index, id) {
Ok(res) => res,
Err(e) => {
log::warn!("unexpected error during filtering: {}", e);
false
}
}
});
}
if let Some(field) = self.index.main.distinct_attribute(reader)? {
let index = &self.index;
query_builder.with_distinct(1, move |id| {
match index.document_attribute_bytes(reader, id, field) {
Ok(Some(bytes)) => {
let mut s = SipHasher::new();
bytes.hash(&mut s);
Some(s.finish())
}
_ => None,
}
});
}
query_builder.set_facet_filter(self.facet_filters);
query_builder.set_facets(self.facets);
let start = Instant::now();
let result = query_builder.query(reader, self.query.as_deref(), self.offset..(self.offset + self.limit));
let search_result = result.map_err(Error::search_documents)?;
let time_ms = start.elapsed().as_millis() as usize;
let mut all_attributes: HashSet<&str> = HashSet::new();
let mut all_formatted: HashSet<&str> = HashSet::new();
match &self.attributes_to_retrieve {
Some(to_retrieve) => {
all_attributes.extend(to_retrieve.iter().map(String::as_str));
if let Some(to_highlight) = &self.attributes_to_highlight {
all_formatted.extend(to_highlight.iter().map(String::as_str));
}
if let Some(to_crop) = &self.attributes_to_crop {
all_formatted.extend(to_crop.keys().map(String::as_str));
}
all_attributes.extend(&all_formatted);
},
None => {
all_attributes.extend(schema.displayed_names());
// If we specified at least one attribute to highlight or crop then
// all available attributes will be returned in the _formatted field.
if self.attributes_to_highlight.is_some() || self.attributes_to_crop.is_some() {
all_formatted.extend(all_attributes.iter().cloned());
}
},
}
let mut hits = Vec::with_capacity(self.limit);
for doc in search_result.documents {
let mut document: IndexMap<String, Value> = self
.index
.document(reader, Some(&all_attributes), doc.id)
.map_err(|e| Error::retrieve_document(doc.id.0, e))?
.unwrap_or_default();
let mut formatted = document.iter()
.filter(|(key, _)| all_formatted.contains(key.as_str()))
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
let mut matches = doc.highlights.clone();
// Crops fields if needed
if let Some(fields) = &self.attributes_to_crop {
crop_document(&mut formatted, &mut matches, &schema, fields);
}
// Transform to readable matches
if let Some(attributes_to_highlight) = &self.attributes_to_highlight {
let matches = calculate_matches(
&matches,
self.attributes_to_highlight.clone(),
&schema,
);
formatted = calculate_highlights(&formatted, &matches, attributes_to_highlight);
}
let matches_info = if self.matches {
Some(calculate_matches(&matches, self.attributes_to_retrieve.clone(), &schema))
} else {
None
};
if let Some(attributes_to_retrieve) = &self.attributes_to_retrieve {
document.retain(|key, _| attributes_to_retrieve.contains(&key.to_string()))
}
let hit = SearchHit {
document,
formatted,
matches_info,
};
hits.push(hit);
}
let results = SearchResult {
hits,
offset: self.offset,
limit: self.limit,
nb_hits: search_result.nb_hits,
exhaustive_nb_hits: search_result.exhaustive_nb_hit,
processing_time_ms: time_ms,
query: self.query.unwrap_or_default(),
facets_distribution: search_result.facets,
exhaustive_facets_count: search_result.exhaustive_facets_count,
};
Ok(results)
}
pub fn get_criteria(
&self,
reader: &MainReader,
ranked_map: &'a RankedMap,
schema: &Schema,
) -> Result<Option<Criteria<'a>>, ResponseError> {
let ranking_rules = self.index.main.ranking_rules(reader)?;
if let Some(ranking_rules) = ranking_rules {
let mut builder = CriteriaBuilder::with_capacity(7 + ranking_rules.len());
for rule in ranking_rules {
match rule {
RankingRule::Typo => builder.push(Typo),
RankingRule::Words => builder.push(Words),
RankingRule::Proximity => builder.push(Proximity),
RankingRule::Attribute => builder.push(Attribute),
RankingRule::WordsPosition => builder.push(WordsPosition),
RankingRule::Exactness => builder.push(Exactness),
RankingRule::Asc(field) => {
match SortByAttr::lower_is_better(&ranked_map, &schema, &field) {
Ok(rule) => builder.push(rule),
Err(err) => error!("Error during criteria builder; {:?}", err),
}
}
RankingRule::Desc(field) => {
match SortByAttr::higher_is_better(&ranked_map, &schema, &field) {
Ok(rule) => builder.push(rule),
Err(err) => error!("Error during criteria builder; {:?}", err),
}
}
}
}
builder.push(DocumentId);
return Ok(Some(builder.build()));
}
Ok(None)
}
}
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct MatchPosition {
pub start: usize,
pub length: usize,
}
impl PartialOrd for MatchPosition {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for MatchPosition {
fn cmp(&self, other: &Self) -> Ordering {
match self.start.cmp(&other.start) {
Ordering::Equal => self.length.cmp(&other.length),
_ => self.start.cmp(&other.start),
}
}
}
pub type HighlightInfos = HashMap<String, Value>;
pub type MatchesInfos = HashMap<String, Vec<MatchPosition>>;
// pub type RankingInfos = HashMap<String, u64>;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchHit {
#[serde(flatten)]
pub document: IndexMap<String, Value>,
#[serde(rename = "_formatted", skip_serializing_if = "IndexMap::is_empty")]
pub formatted: IndexMap<String, Value>,
#[serde(rename = "_matchesInfo", skip_serializing_if = "Option::is_none")]
pub matches_info: Option<MatchesInfos>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub hits: Vec<SearchHit>,
pub offset: usize,
pub limit: usize,
pub nb_hits: usize,
pub exhaustive_nb_hits: bool,
pub processing_time_ms: usize,
pub query: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub facets_distribution: Option<HashMap<String, HashMap<String, usize>>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub exhaustive_facets_count: Option<bool>,
}
/// returns the start index and the length on the crop.
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c);
let word_end_index = |mut index| {
if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
index += text.chars().skip(index).take_while(is_word_component).count();
}
index
};
if context == 0 {
// count need to be at least 1 for cjk queries to return something
return (match_index, 1 + text.chars().skip(match_index).take_while(is_word_component).count());
}
let start = match match_index.saturating_sub(context) {
0 => 0,
n => {
let word_end_index = word_end_index(n);
// skip whitespaces if any
word_end_index + text.chars().skip(word_end_index).take_while(char::is_ascii_whitespace).count()
}
};
let end = word_end_index(match_index + context);
(start, end - start)
}
fn crop_text(
text: &str,
matches: impl IntoIterator<Item = Highlight>,
context: usize,
) -> (String, Vec<Highlight>) {
let mut matches = matches.into_iter().peekable();
let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
let (start, count) = aligned_crop(text, char_index, context);
// TODO do something about double allocation
let text = text
.chars()
.skip(start)
.take(count)
.collect::<String>()
.trim()
.to_string();
// update matches index to match the new cropped text
let matches = matches
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + count)
.map(|m| Highlight {
char_index: m.char_index - start as u16,
..m
})
.collect();
(text, matches)
}
fn crop_document(
document: &mut IndexMap<String, Value>,
matches: &mut Vec<Highlight>,
schema: &Schema,
fields: &HashMap<String, usize>,
) {
matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
for (field, length) in fields {
let attribute = match schema.id(field) {
Some(attribute) => attribute,
None => continue,
};
let selected_matches = matches
.iter()
.filter(|m| FieldId::new(m.attribute) == attribute)
.cloned();
if let Some(Value::String(ref mut original_text)) = document.get_mut(field) {
let (cropped_text, cropped_matches) =
crop_text(original_text, selected_matches, *length);
*original_text = cropped_text;
matches.retain(|m| FieldId::new(m.attribute) != attribute);
matches.extend_from_slice(&cropped_matches);
}
}
}
fn calculate_matches(
matches: &[Highlight],
attributes_to_retrieve: Option<HashSet<String>>,
schema: &Schema,
) -> MatchesInfos {
let mut matches_result: HashMap<String, Vec<MatchPosition>> = HashMap::new();
for m in matches.iter() {
if let Some(attribute) = schema.name(FieldId::new(m.attribute)) {
if let Some(ref attributes_to_retrieve) = attributes_to_retrieve {
if !attributes_to_retrieve.contains(attribute) {
continue;
}
}
if !schema.displayed_names().contains(&attribute) {
continue;
}
if let Some(pos) = matches_result.get_mut(attribute) {
pos.push(MatchPosition {
start: m.char_index as usize,
length: m.char_length as usize,
});
} else {
let mut positions = Vec::new();
positions.push(MatchPosition {
start: m.char_index as usize,
length: m.char_length as usize,
});
matches_result.insert(attribute.to_string(), positions);
}
}
}
for (_, val) in matches_result.iter_mut() {
val.sort_unstable();
val.dedup();
}
matches_result
}
fn calculate_highlights(
document: &IndexMap<String, Value>,
matches: &MatchesInfos,
attributes_to_highlight: &HashSet<String>,
) -> IndexMap<String, Value> {
let mut highlight_result = document.clone();
for (attribute, matches) in matches.iter() {
if attributes_to_highlight.contains(attribute) {
if let Some(Value::String(value)) = document.get(attribute) {
let value = value;
let mut highlighted_value = String::new();
let mut index = 0;
let longest_matches = matches
.linear_group_by_key(|m| m.start)
.map(|group| group.last().unwrap())
.filter(move |m| m.start >= index);
for m in longest_matches {
let before = value.get(index..m.start);
let highlighted = value.get(m.start..(m.start + m.length));
if let (Some(before), Some(highlighted)) = (before, highlighted) {
highlighted_value.push_str(before);
highlighted_value.push_str("<em>");
highlighted_value.push_str(highlighted);
highlighted_value.push_str("</em>");
index = m.start + m.length;
} else {
error!("value: {:?}; index: {:?}, match: {:?}", value, index, m);
}
}
highlighted_value.push_str(&value[index..]);
highlight_result.insert(attribute.to_string(), Value::String(highlighted_value));
};
}
}
highlight_result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn aligned_crops() {
let text = r#"En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation."#;
// simple test
let (start, length) = aligned_crop(&text, 6, 2);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("début", cropped);
// first word test
let (start, length) = aligned_crop(&text, 0, 1);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("En", cropped);
// last word test
let (start, length) = aligned_crop(&text, 510, 2);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("Fondation", cropped);
// CJK tests
let text = "this isのス foo myタイリ test";
// mixed charset
let (start, length) = aligned_crop(&text, 5, 3);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("isの", cropped);
// split regular word / CJK word, no space
let (start, length) = aligned_crop(&text, 7, 1);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("", cropped);
}
#[test]
fn calculate_matches() {
let mut matches = Vec::new();
matches.push(Highlight { attribute: 0, char_index: 0, char_length: 3});
matches.push(Highlight { attribute: 0, char_index: 0, char_length: 2});
let mut attributes_to_retrieve: HashSet<String> = HashSet::new();
attributes_to_retrieve.insert("title".to_string());
let schema = Schema::with_primary_key("title");
let matches_result = super::calculate_matches(&matches, Some(attributes_to_retrieve), &schema);
let mut matches_result_expected: HashMap<String, Vec<MatchPosition>> = HashMap::new();
let mut positions = Vec::new();
positions.push(MatchPosition {
start: 0,
length: 2,
});
positions.push(MatchPosition {
start: 0,
length: 3,
});
matches_result_expected.insert("title".to_string(), positions);
assert_eq!(matches_result, matches_result_expected);
}
#[test]
fn calculate_highlights() {
let data = r#"{
"title": "Fondation (Isaac ASIMOV)",
"description": "En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation."
}"#;
let document: IndexMap<String, Value> = serde_json::from_str(data).unwrap();
let mut attributes_to_highlight = HashSet::new();
attributes_to_highlight.insert("title".to_string());
attributes_to_highlight.insert("description".to_string());
let mut matches = HashMap::new();
let mut m = Vec::new();
m.push(MatchPosition {
start: 0,
length: 9,
});
matches.insert("title".to_string(), m);
let mut m = Vec::new();
m.push(MatchPosition {
start: 529,
length: 9,
});
matches.insert("description".to_string(), m);
let result = super::calculate_highlights(&document, &matches, &attributes_to_highlight);
let mut result_expected = IndexMap::new();
result_expected.insert(
"title".to_string(),
Value::String("<em>Fondation</em> (Isaac ASIMOV)".to_string()),
);
result_expected.insert("description".to_string(), Value::String("En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la <em>Fondation</em>.".to_string()));
assert_eq!(result, result_expected);
}
#[test]
fn highlight_longest_match() {
let data = r#"{
"title": "Ice"
}"#;
let document: IndexMap<String, Value> = serde_json::from_str(data).unwrap();
let mut attributes_to_highlight = HashSet::new();
attributes_to_highlight.insert("title".to_string());
let mut matches = HashMap::new();
let mut m = Vec::new();
m.push(MatchPosition {
start: 0,
length: 2,
});
m.push(MatchPosition {
start: 0,
length: 3,
});
matches.insert("title".to_string(), m);
let result = super::calculate_highlights(&document, &matches, &attributes_to_highlight);
let mut result_expected = IndexMap::new();
result_expected.insert(
"title".to_string(),
Value::String("<em>Ice</em>".to_string()),
);
assert_eq!(result, result_expected);
}
}

View File

@ -1,26 +1,4 @@
pub mod authentication;
pub mod meilisearch;
pub mod normalize_path;
pub mod compression; pub mod compression;
mod env;
pub use authentication::Authentication; pub use env::EnvSizer;
pub use normalize_path::NormalizePath;
pub fn is_cjk(c: char) -> bool {
('\u{1100}'..'\u{11ff}').contains(&c) // Hangul Jamo
|| ('\u{2e80}'..'\u{2eff}').contains(&c) // CJK Radicals Supplement
|| ('\u{2f00}'..'\u{2fdf}').contains(&c) // Kangxi radical
|| ('\u{3000}'..'\u{303f}').contains(&c) // Japanese-style punctuation
|| ('\u{3040}'..'\u{309f}').contains(&c) // Japanese Hiragana
|| ('\u{30a0}'..'\u{30ff}').contains(&c) // Japanese Katakana
|| ('\u{3100}'..'\u{312f}').contains(&c)
|| ('\u{3130}'..'\u{318F}').contains(&c) // Hangul Compatibility Jamo
|| ('\u{3200}'..'\u{32ff}').contains(&c) // Enclosed CJK Letters and Months
|| ('\u{3400}'..'\u{4dbf}').contains(&c) // CJK Unified Ideographs Extension A
|| ('\u{4e00}'..'\u{9fff}').contains(&c) // CJK Unified Ideographs
|| ('\u{a960}'..'\u{a97f}').contains(&c) // Hangul Jamo Extended-A
|| ('\u{ac00}'..'\u{d7a3}').contains(&c) // Hangul Syllables
|| ('\u{d7b0}'..'\u{d7ff}').contains(&c) // Hangul Jamo Extended-B
|| ('\u{f900}'..'\u{faff}').contains(&c) // CJK Compatibility Ideographs
|| ('\u{ff00}'..'\u{ffef}').contains(&c) // Full-width roman characters and half-width katakana
}

View File

@ -1,86 +0,0 @@
/// From https://docs.rs/actix-web/3.0.0-alpha.2/src/actix_web/middleware/normalize.rs.html#34
use actix_http::Error;
use actix_service::{Service, Transform};
use actix_web::{
dev::ServiceRequest,
dev::ServiceResponse,
http::uri::{PathAndQuery, Uri},
};
use futures::future::{ok, Ready};
use regex::Regex;
use std::task::{Context, Poll};
pub struct NormalizePath;
impl<S, B> Transform<S> for NormalizePath
where
S: Service<Request = ServiceRequest, Response = ServiceResponse<B>, Error = Error>,
S::Future: 'static,
{
type Request = ServiceRequest;
type Response = ServiceResponse<B>;
type Error = Error;
type InitError = ();
type Transform = NormalizePathNormalization<S>;
type Future = Ready<Result<Self::Transform, Self::InitError>>;
fn new_transform(&self, service: S) -> Self::Future {
ok(NormalizePathNormalization {
service,
merge_slash: Regex::new("//+").unwrap(),
})
}
}
pub struct NormalizePathNormalization<S> {
service: S,
merge_slash: Regex,
}
impl<S, B> Service for NormalizePathNormalization<S>
where
S: Service<Request = ServiceRequest, Response = ServiceResponse<B>, Error = Error>,
S::Future: 'static,
{
type Request = ServiceRequest;
type Response = ServiceResponse<B>;
type Error = Error;
type Future = S::Future;
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
self.service.poll_ready(cx)
}
fn call(&mut self, mut req: ServiceRequest) -> Self::Future {
let head = req.head_mut();
// always add trailing slash, might be an extra one
let path = head.uri.path().to_string() + "/";
if self.merge_slash.find(&path).is_some() {
// normalize multiple /'s to one /
let path = self.merge_slash.replace_all(&path, "/");
let path = if path.len() > 1 {
path.trim_end_matches('/')
} else {
&path
};
let mut parts = head.uri.clone().into_parts();
let pq = parts.path_and_query.as_ref().unwrap();
let path = if let Some(q) = pq.query() {
bytes::Bytes::from(format!("{}?{}", path, q))
} else {
bytes::Bytes::copy_from_slice(path.as_bytes())
};
parts.path_and_query = Some(PathAndQuery::from_maybe_shared(path).unwrap());
let uri = Uri::from_parts(parts).unwrap();
req.match_info_mut().get_mut().update(&uri);
req.head_mut().uri = uri;
}
self.service.call(req)
}
}

View File

@ -0,0 +1,134 @@
use std::fs::{create_dir_all, File};
use std::io::{BufRead, BufReader, Write};
use std::path::Path;
use std::sync::Arc;
use anyhow::{bail, Context};
use heed::RoTxn;
use indexmap::IndexMap;
use milli::update::{IndexDocumentsMethod, UpdateFormat::JsonStream};
use serde::{Deserialize, Serialize};
use crate::option::IndexerOpts;
use super::error::Result;
use super::{update_handler::UpdateHandler, Index, Settings, Unchecked};
#[derive(Serialize, Deserialize)]
struct DumpMeta {
settings: Settings<Unchecked>,
primary_key: Option<String>,
}
const META_FILE_NAME: &str = "meta.json";
const DATA_FILE_NAME: &str = "documents.jsonl";
impl Index {
pub fn dump(&self, path: impl AsRef<Path>) -> Result<()> {
// acquire write txn make sure any ongoing write is finished before we start.
let txn = self.env.write_txn()?;
self.dump_documents(&txn, &path)?;
self.dump_meta(&txn, &path)?;
Ok(())
}
fn dump_documents(&self, txn: &RoTxn, path: impl AsRef<Path>) -> Result<()> {
let document_file_path = path.as_ref().join(DATA_FILE_NAME);
let mut document_file = File::create(&document_file_path)?;
let documents = self.all_documents(txn)?;
let fields_ids_map = self.fields_ids_map(txn)?;
// dump documents
let mut json_map = IndexMap::new();
for document in documents {
let (_, reader) = document?;
for (fid, bytes) in reader.iter() {
if let Some(name) = fields_ids_map.name(fid) {
json_map.insert(name, serde_json::from_slice::<serde_json::Value>(bytes)?);
}
}
serde_json::to_writer(&mut document_file, &json_map)?;
document_file.write_all(b"\n")?;
json_map.clear();
}
Ok(())
}
fn dump_meta(&self, txn: &RoTxn, path: impl AsRef<Path>) -> Result<()> {
let meta_file_path = path.as_ref().join(META_FILE_NAME);
let mut meta_file = File::create(&meta_file_path)?;
let settings = self.settings_txn(txn)?.into_unchecked();
let primary_key = self.primary_key(txn)?.map(String::from);
let meta = DumpMeta {
settings,
primary_key,
};
serde_json::to_writer(&mut meta_file, &meta)?;
Ok(())
}
pub fn load_dump(
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
size: usize,
indexing_options: &IndexerOpts,
) -> anyhow::Result<()> {
let dir_name = src
.as_ref()
.file_name()
.with_context(|| format!("invalid dump index: {}", src.as_ref().display()))?;
let dst_dir_path = dst.as_ref().join("indexes").join(dir_name);
create_dir_all(&dst_dir_path)?;
let meta_path = src.as_ref().join(META_FILE_NAME);
let mut meta_file = File::open(meta_path)?;
let DumpMeta {
settings,
primary_key,
} = serde_json::from_reader(&mut meta_file)?;
let settings = settings.check();
let index = Self::open(&dst_dir_path, size)?;
let mut txn = index.write_txn()?;
let handler = UpdateHandler::new(&indexing_options)?;
index.update_settings_txn(&mut txn, &settings, handler.update_builder(0))?;
let document_file_path = src.as_ref().join(DATA_FILE_NAME);
let reader = File::open(&document_file_path)?;
let mut reader = BufReader::new(reader);
reader.fill_buf()?;
// If the document file is empty, we don't perform the document addition, to prevent
// a primary key error to be thrown.
if !reader.buffer().is_empty() {
index.update_documents_txn(
&mut txn,
JsonStream,
IndexDocumentsMethod::UpdateDocuments,
Some(reader),
handler.update_builder(0),
primary_key.as_deref(),
)?;
}
txn.commit()?;
match Arc::try_unwrap(index.0) {
Ok(inner) => inner.prepare_for_closing().wait(),
Err(_) => bail!("Could not close index properly."),
}
Ok(())
}
}

View File

@ -0,0 +1,52 @@
use std::error::Error;
use meilisearch_error::{Code, ErrorCode};
use serde_json::Value;
use crate::error::MilliError;
pub type Result<T> = std::result::Result<T, IndexError>;
#[derive(Debug, thiserror::Error)]
pub enum IndexError {
#[error("Internal error: {0}")]
Internal(Box<dyn Error + Send + Sync + 'static>),
#[error("Document with id {0} not found.")]
DocumentNotFound(String),
#[error("{0}")]
Facet(#[from] FacetError),
#[error("{0}")]
Milli(#[from] milli::Error),
}
internal_error!(
IndexError: std::io::Error,
heed::Error,
fst::Error,
serde_json::Error
);
impl ErrorCode for IndexError {
fn error_code(&self) -> Code {
match self {
IndexError::Internal(_) => Code::Internal,
IndexError::DocumentNotFound(_) => Code::DocumentNotFound,
IndexError::Facet(e) => e.error_code(),
IndexError::Milli(e) => MilliError(e).error_code(),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum FacetError {
#[error("Invalid facet expression, expected {}, found: {1}", .0.join(", "))]
InvalidExpression(&'static [&'static str], Value),
}
impl ErrorCode for FacetError {
fn error_code(&self) -> Code {
match self {
FacetError::InvalidExpression(_, _) => Code::Facet,
}
}
}

View File

@ -0,0 +1,194 @@
use std::collections::{BTreeSet, HashSet};
use std::fs::create_dir_all;
use std::marker::PhantomData;
use std::ops::Deref;
use std::path::Path;
use std::sync::Arc;
use heed::{EnvOpenOptions, RoTxn};
use milli::obkv_to_json;
use serde::{de::Deserializer, Deserialize};
use serde_json::{Map, Value};
use crate::helpers::EnvSizer;
use error::Result;
pub use search::{default_crop_length, SearchQuery, SearchResult, DEFAULT_SEARCH_LIMIT};
pub use updates::{Checked, Facets, Settings, Unchecked};
use self::error::IndexError;
pub mod error;
pub mod update_handler;
mod dump;
mod search;
mod updates;
pub type Document = Map<String, Value>;
#[derive(Clone)]
pub struct Index(pub Arc<milli::Index>);
impl Deref for Index {
type Target = milli::Index;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
pub fn deserialize_some<'de, T, D>(deserializer: D) -> std::result::Result<Option<T>, D::Error>
where
T: Deserialize<'de>,
D: Deserializer<'de>,
{
Deserialize::deserialize(deserializer).map(Some)
}
impl Index {
pub fn open(path: impl AsRef<Path>, size: usize) -> Result<Self> {
create_dir_all(&path)?;
let mut options = EnvOpenOptions::new();
options.map_size(size);
let index = milli::Index::new(options, &path)?;
Ok(Index(Arc::new(index)))
}
pub fn settings(&self) -> Result<Settings<Checked>> {
let txn = self.read_txn()?;
self.settings_txn(&txn)
}
pub fn settings_txn(&self, txn: &RoTxn) -> Result<Settings<Checked>> {
let displayed_attributes = self
.displayed_fields(&txn)?
.map(|fields| fields.into_iter().map(String::from).collect());
let searchable_attributes = self
.searchable_fields(&txn)?
.map(|fields| fields.into_iter().map(String::from).collect());
let faceted_attributes = self.faceted_fields(&txn)?.into_iter().collect();
let criteria = self
.criteria(&txn)?
.into_iter()
.map(|c| c.to_string())
.collect();
let stop_words = self
.stop_words(&txn)?
.map(|stop_words| -> Result<BTreeSet<_>> {
Ok(stop_words.stream().into_strs()?.into_iter().collect())
})
.transpose()?
.unwrap_or_else(BTreeSet::new);
let distinct_field = self.distinct_field(&txn)?.map(String::from);
// in milli each word in the synonyms map were split on their separator. Since we lost
// this information we are going to put space between words.
let synonyms = self
.synonyms(&txn)?
.iter()
.map(|(key, values)| {
(
key.join(" "),
values.iter().map(|value| value.join(" ")).collect(),
)
})
.collect();
Ok(Settings {
displayed_attributes: Some(displayed_attributes),
searchable_attributes: Some(searchable_attributes),
filterable_attributes: Some(Some(faceted_attributes)),
ranking_rules: Some(Some(criteria)),
stop_words: Some(Some(stop_words)),
distinct_attribute: Some(distinct_field),
synonyms: Some(Some(synonyms)),
_kind: PhantomData,
})
}
pub fn retrieve_documents<S: AsRef<str>>(
&self,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<Vec<Map<String, Value>>> {
let txn = self.read_txn()?;
let fields_ids_map = self.fields_ids_map(&txn)?;
let fields_to_display =
self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?;
let iter = self.documents.range(&txn, &(..))?.skip(offset).take(limit);
let mut documents = Vec::new();
for entry in iter {
let (_id, obkv) = entry?;
let object = obkv_to_json(&fields_to_display, &fields_ids_map, obkv)?;
documents.push(object);
}
Ok(documents)
}
pub fn retrieve_document<S: AsRef<str>>(
&self,
doc_id: String,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<Map<String, Value>> {
let txn = self.read_txn()?;
let fields_ids_map = self.fields_ids_map(&txn)?;
let fields_to_display =
self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?;
let internal_id = self
.external_documents_ids(&txn)?
.get(doc_id.as_bytes())
.ok_or_else(|| IndexError::DocumentNotFound(doc_id.clone()))?;
let document = self
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next()
.map(|(_, d)| d)
.ok_or(IndexError::DocumentNotFound(doc_id))?;
let document = obkv_to_json(&fields_to_display, &fields_ids_map, document)?;
Ok(document)
}
pub fn size(&self) -> u64 {
self.env.size()
}
fn fields_to_display<S: AsRef<str>>(
&self,
txn: &heed::RoTxn,
attributes_to_retrieve: &Option<Vec<S>>,
fields_ids_map: &milli::FieldsIdsMap,
) -> Result<Vec<u8>> {
let mut displayed_fields_ids = match self.displayed_fields_ids(&txn)? {
Some(ids) => ids.into_iter().collect::<Vec<_>>(),
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
let attributes_to_retrieve_ids = match attributes_to_retrieve {
Some(attrs) => attrs
.iter()
.filter_map(|f| fields_ids_map.id(f.as_ref()))
.collect::<HashSet<_>>(),
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
displayed_fields_ids.retain(|fid| attributes_to_retrieve_ids.contains(fid));
Ok(displayed_fields_ids)
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,92 @@
use std::fs::File;
use crate::index::Index;
use grenad::CompressionType;
use milli::update::UpdateBuilder;
use rayon::ThreadPool;
use crate::index_controller::UpdateMeta;
use crate::index_controller::{Failed, Processed, Processing};
use crate::option::IndexerOpts;
pub struct UpdateHandler {
max_nb_chunks: Option<usize>,
chunk_compression_level: Option<u32>,
thread_pool: ThreadPool,
log_frequency: usize,
max_memory: usize,
linked_hash_map_size: usize,
chunk_compression_type: CompressionType,
chunk_fusing_shrink_size: u64,
}
impl UpdateHandler {
pub fn new(opt: &IndexerOpts) -> anyhow::Result<Self> {
let thread_pool = rayon::ThreadPoolBuilder::new()
.num_threads(opt.indexing_jobs.unwrap_or(num_cpus::get() / 2))
.build()?;
Ok(Self {
max_nb_chunks: opt.max_nb_chunks,
chunk_compression_level: opt.chunk_compression_level,
thread_pool,
log_frequency: opt.log_every_n,
max_memory: opt.max_memory.get_bytes() as usize,
linked_hash_map_size: opt.linked_hash_map_size,
chunk_compression_type: opt.chunk_compression_type,
chunk_fusing_shrink_size: opt.chunk_fusing_shrink_size.get_bytes(),
})
}
pub fn update_builder(&self, update_id: u64) -> UpdateBuilder {
// We prepare the update by using the update builder.
let mut update_builder = UpdateBuilder::new(update_id);
if let Some(max_nb_chunks) = self.max_nb_chunks {
update_builder.max_nb_chunks(max_nb_chunks);
}
if let Some(chunk_compression_level) = self.chunk_compression_level {
update_builder.chunk_compression_level(chunk_compression_level);
}
update_builder.thread_pool(&self.thread_pool);
update_builder.log_every_n(self.log_frequency);
update_builder.max_memory(self.max_memory);
update_builder.linked_hash_map_size(self.linked_hash_map_size);
update_builder.chunk_compression_type(self.chunk_compression_type);
update_builder.chunk_fusing_shrink_size(self.chunk_fusing_shrink_size);
update_builder
}
pub fn handle_update(
&self,
meta: Processing,
content: Option<File>,
index: Index,
) -> Result<Processed, Failed> {
use UpdateMeta::*;
let update_id = meta.id();
let update_builder = self.update_builder(update_id);
let result = match meta.meta() {
DocumentsAddition {
method,
format,
primary_key,
} => index.update_documents(
*format,
*method,
content,
update_builder,
primary_key.as_deref(),
),
ClearDocuments => index.clear_documents(update_builder),
DeleteDocuments { ids } => index.delete_documents(ids, update_builder),
Settings(settings) => index.update_settings(&settings.clone().check(), update_builder),
};
match result {
Ok(result) => Ok(meta.process(result)),
Err(e) => Err(meta.fail(e.into())),
}
}
}

View File

@ -0,0 +1,382 @@
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::io;
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use flate2::read::GzDecoder;
use log::{debug, info, trace};
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use serde::{Deserialize, Serialize, Serializer};
use crate::index_controller::UpdateResult;
use super::error::Result;
use super::{deserialize_some, Index};
fn serialize_with_wildcard<S>(
field: &Option<Option<Vec<String>>>,
s: S,
) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
let wildcard = vec!["*".to_string()];
s.serialize_some(&field.as_ref().map(|o| o.as_ref().unwrap_or(&wildcard)))
}
#[derive(Clone, Default, Debug, Serialize)]
pub struct Checked;
#[derive(Clone, Default, Debug, Serialize, Deserialize)]
pub struct Unchecked;
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
#[serde(bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>"))]
pub struct Settings<T> {
#[serde(
default,
deserialize_with = "deserialize_some",
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Option::is_none"
)]
pub displayed_attributes: Option<Option<Vec<String>>>,
#[serde(
default,
deserialize_with = "deserialize_some",
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Option::is_none"
)]
pub searchable_attributes: Option<Option<Vec<String>>>,
#[serde(
default,
deserialize_with = "deserialize_some",
skip_serializing_if = "Option::is_none"
)]
pub filterable_attributes: Option<Option<HashSet<String>>>,
#[serde(
default,
deserialize_with = "deserialize_some",
skip_serializing_if = "Option::is_none"
)]
pub ranking_rules: Option<Option<Vec<String>>>,
#[serde(
default,
deserialize_with = "deserialize_some",
skip_serializing_if = "Option::is_none"
)]
pub stop_words: Option<Option<BTreeSet<String>>>,
#[serde(
default,
deserialize_with = "deserialize_some",
skip_serializing_if = "Option::is_none"
)]
pub synonyms: Option<Option<BTreeMap<String, Vec<String>>>>,
#[serde(
default,
deserialize_with = "deserialize_some",
skip_serializing_if = "Option::is_none"
)]
pub distinct_attribute: Option<Option<String>>,
#[serde(skip)]
pub _kind: PhantomData<T>,
}
impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> {
Settings {
displayed_attributes: Some(None),
searchable_attributes: Some(None),
filterable_attributes: Some(None),
ranking_rules: Some(None),
stop_words: Some(None),
synonyms: Some(None),
distinct_attribute: Some(None),
_kind: PhantomData,
}
}
pub fn into_unchecked(self) -> Settings<Unchecked> {
let Self {
displayed_attributes,
searchable_attributes,
filterable_attributes,
ranking_rules,
stop_words,
synonyms,
distinct_attribute,
..
} = self;
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes,
ranking_rules,
stop_words,
synonyms,
distinct_attribute,
_kind: PhantomData,
}
}
}
impl Settings<Unchecked> {
pub fn check(mut self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes.take() {
Some(Some(fields)) => {
if fields.iter().any(|f| f == "*") {
Some(None)
} else {
Some(Some(fields))
}
}
otherwise => otherwise,
};
let searchable_attributes = match self.searchable_attributes.take() {
Some(Some(fields)) => {
if fields.iter().any(|f| f == "*") {
Some(None)
} else {
Some(Some(fields))
}
}
otherwise => otherwise,
};
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes: self.filterable_attributes,
ranking_rules: self.ranking_rules,
stop_words: self.stop_words,
synonyms: self.synonyms,
distinct_attribute: self.distinct_attribute,
_kind: PhantomData,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct Facets {
pub level_group_size: Option<NonZeroUsize>,
pub min_level_size: Option<NonZeroUsize>,
}
impl Index {
pub fn update_documents(
&self,
format: UpdateFormat,
method: IndexDocumentsMethod,
content: Option<impl io::Read>,
update_builder: UpdateBuilder,
primary_key: Option<&str>,
) -> Result<UpdateResult> {
let mut txn = self.write_txn()?;
let result = self.update_documents_txn(
&mut txn,
format,
method,
content,
update_builder,
primary_key,
)?;
txn.commit()?;
Ok(result)
}
pub fn update_documents_txn<'a, 'b>(
&'a self,
txn: &mut heed::RwTxn<'a, 'b>,
format: UpdateFormat,
method: IndexDocumentsMethod,
content: Option<impl io::Read>,
update_builder: UpdateBuilder,
primary_key: Option<&str>,
) -> Result<UpdateResult> {
trace!("performing document addition");
// Set the primary key if not set already, ignore if already set.
if let (None, Some(primary_key)) = (self.primary_key(txn)?, primary_key) {
let mut builder = UpdateBuilder::new(0).settings(txn, &self);
builder.set_primary_key(primary_key.to_string());
builder.execute(|_, _| ())?;
}
let mut builder = update_builder.index_documents(txn, self);
builder.update_format(format);
builder.index_documents_method(method);
let indexing_callback =
|indexing_step, update_id| debug!("update {}: {:?}", update_id, indexing_step);
let gzipped = false;
let addition = match content {
Some(content) if gzipped => {
builder.execute(GzDecoder::new(content), indexing_callback)?
}
Some(content) => builder.execute(content, indexing_callback)?,
None => builder.execute(std::io::empty(), indexing_callback)?,
};
info!("document addition done: {:?}", addition);
Ok(UpdateResult::DocumentsAddition(addition))
}
pub fn clear_documents(&self, update_builder: UpdateBuilder) -> Result<UpdateResult> {
// We must use the write transaction of the update here.
let mut wtxn = self.write_txn()?;
let builder = update_builder.clear_documents(&mut wtxn, self);
let _count = builder.execute()?;
wtxn.commit()
.and(Ok(UpdateResult::Other))
.map_err(Into::into)
}
pub fn update_settings_txn<'a, 'b>(
&'a self,
txn: &mut heed::RwTxn<'a, 'b>,
settings: &Settings<Checked>,
update_builder: UpdateBuilder,
) -> Result<UpdateResult> {
// We must use the write transaction of the update here.
let mut builder = update_builder.settings(txn, self);
if let Some(ref names) = settings.searchable_attributes {
match names {
Some(names) => builder.set_searchable_fields(names.clone()),
None => builder.reset_searchable_fields(),
}
}
if let Some(ref names) = settings.displayed_attributes {
match names {
Some(names) => builder.set_displayed_fields(names.clone()),
None => builder.reset_displayed_fields(),
}
}
if let Some(ref facet_types) = settings.filterable_attributes {
let facet_types = facet_types.clone().unwrap_or_else(HashSet::new);
builder.set_filterable_fields(facet_types);
}
if let Some(ref criteria) = settings.ranking_rules {
match criteria {
Some(criteria) => builder.set_criteria(criteria.clone()),
None => builder.reset_criteria(),
}
}
if let Some(ref stop_words) = settings.stop_words {
match stop_words {
Some(stop_words) => builder.set_stop_words(stop_words.clone()),
None => builder.reset_stop_words(),
}
}
if let Some(ref synonyms) = settings.synonyms {
match synonyms {
Some(synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
None => builder.reset_synonyms(),
}
}
if let Some(ref distinct_attribute) = settings.distinct_attribute {
match distinct_attribute {
Some(attr) => builder.set_distinct_field(attr.clone()),
None => builder.reset_distinct_field(),
}
}
builder.execute(|indexing_step, update_id| {
debug!("update {}: {:?}", update_id, indexing_step)
})?;
Ok(UpdateResult::Other)
}
pub fn update_settings(
&self,
settings: &Settings<Checked>,
update_builder: UpdateBuilder,
) -> Result<UpdateResult> {
let mut txn = self.write_txn()?;
let result = self.update_settings_txn(&mut txn, settings, update_builder)?;
txn.commit()?;
Ok(result)
}
pub fn delete_documents(
&self,
document_ids: &[String],
update_builder: UpdateBuilder,
) -> Result<UpdateResult> {
let mut txn = self.write_txn()?;
let mut builder = update_builder.delete_documents(&mut txn, self)?;
// We ignore unexisting document ids
document_ids.iter().for_each(|id| {
builder.delete_external_id(id);
});
let deleted = builder.execute()?;
txn.commit()
.and(Ok(UpdateResult::DocumentDeletion { deleted }))
.map_err(Into::into)
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_setting_check() {
// test no changes
let settings = Settings {
displayed_attributes: Some(Some(vec![String::from("hello")])),
searchable_attributes: Some(Some(vec![String::from("hello")])),
filterable_attributes: None,
ranking_rules: None,
stop_words: None,
synonyms: None,
distinct_attribute: None,
_kind: PhantomData::<Unchecked>,
};
let checked = settings.clone().check();
assert_eq!(settings.displayed_attributes, checked.displayed_attributes);
assert_eq!(
settings.searchable_attributes,
checked.searchable_attributes
);
// test wildcard
// test no changes
let settings = Settings {
displayed_attributes: Some(Some(vec![String::from("*")])),
searchable_attributes: Some(Some(vec![String::from("hello"), String::from("*")])),
filterable_attributes: None,
ranking_rules: None,
stop_words: None,
synonyms: None,
distinct_attribute: None,
_kind: PhantomData::<Unchecked>,
};
let checked = settings.check();
assert_eq!(checked.displayed_attributes, Some(None));
assert_eq!(checked.searchable_attributes, Some(None));
}
}

View File

@ -0,0 +1,157 @@
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use async_stream::stream;
use chrono::Utc;
use futures::{lock::Mutex, stream::StreamExt};
use log::{error, trace};
use tokio::sync::{mpsc, oneshot, RwLock};
use update_actor::UpdateActorHandle;
use uuid_resolver::UuidResolverHandle;
use super::error::{DumpActorError, Result};
use super::{DumpInfo, DumpMsg, DumpStatus, DumpTask};
use crate::index_controller::{update_actor, uuid_resolver};
pub const CONCURRENT_DUMP_MSG: usize = 10;
pub struct DumpActor<UuidResolver, Update> {
inbox: Option<mpsc::Receiver<DumpMsg>>,
uuid_resolver: UuidResolver,
update: Update,
dump_path: PathBuf,
lock: Arc<Mutex<()>>,
dump_infos: Arc<RwLock<HashMap<String, DumpInfo>>>,
update_db_size: usize,
index_db_size: usize,
}
/// Generate uid from creation date
fn generate_uid() -> String {
Utc::now().format("%Y%m%d-%H%M%S%3f").to_string()
}
impl<UuidResolver, Update> DumpActor<UuidResolver, Update>
where
UuidResolver: UuidResolverHandle + Send + Sync + Clone + 'static,
Update: UpdateActorHandle + Send + Sync + Clone + 'static,
{
pub fn new(
inbox: mpsc::Receiver<DumpMsg>,
uuid_resolver: UuidResolver,
update: Update,
dump_path: impl AsRef<Path>,
index_db_size: usize,
update_db_size: usize,
) -> Self {
let dump_infos = Arc::new(RwLock::new(HashMap::new()));
let lock = Arc::new(Mutex::new(()));
Self {
inbox: Some(inbox),
uuid_resolver,
update,
dump_path: dump_path.as_ref().into(),
dump_infos,
lock,
index_db_size,
update_db_size,
}
}
pub async fn run(mut self) {
trace!("Started dump actor.");
let mut inbox = self
.inbox
.take()
.expect("Dump Actor must have a inbox at this point.");
let stream = stream! {
loop {
match inbox.recv().await {
Some(msg) => yield msg,
None => break,
}
}
};
stream
.for_each_concurrent(Some(CONCURRENT_DUMP_MSG), |msg| self.handle_message(msg))
.await;
error!("Dump actor stopped.");
}
async fn handle_message(&self, msg: DumpMsg) {
use DumpMsg::*;
match msg {
CreateDump { ret } => {
let _ = self.handle_create_dump(ret).await;
}
DumpInfo { ret, uid } => {
let _ = ret.send(self.handle_dump_info(uid).await);
}
}
}
async fn handle_create_dump(&self, ret: oneshot::Sender<Result<DumpInfo>>) {
let uid = generate_uid();
let info = DumpInfo::new(uid.clone(), DumpStatus::InProgress);
let _lock = match self.lock.try_lock() {
Some(lock) => lock,
None => {
ret.send(Err(DumpActorError::DumpAlreadyRunning))
.expect("Dump actor is dead");
return;
}
};
self.dump_infos
.write()
.await
.insert(uid.clone(), info.clone());
ret.send(Ok(info)).expect("Dump actor is dead");
let task = DumpTask {
path: self.dump_path.clone(),
uuid_resolver: self.uuid_resolver.clone(),
update_handle: self.update.clone(),
uid: uid.clone(),
update_db_size: self.update_db_size,
index_db_size: self.index_db_size,
};
let task_result = tokio::task::spawn(task.run()).await;
let mut dump_infos = self.dump_infos.write().await;
let dump_infos = dump_infos
.get_mut(&uid)
.expect("dump entry deleted while lock was acquired");
match task_result {
Ok(Ok(())) => {
dump_infos.done();
trace!("Dump succeed");
}
Ok(Err(e)) => {
dump_infos.with_error(e.to_string());
error!("Dump failed: {}", e);
}
Err(_) => {
dump_infos.with_error("Unexpected error while performing dump.".to_string());
error!("Dump panicked. Dump status set to failed");
}
};
}
async fn handle_dump_info(&self, uid: String) -> Result<DumpInfo> {
match self.dump_infos.read().await.get(&uid) {
Some(info) => Ok(info.clone()),
_ => Err(DumpActorError::DumpDoesNotExist(uid)),
}
}
}

Some files were not shown because too many files have changed in this diff Show More