From f456fb5e0ba50a6be0081435711cc1d2eed5a64c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 11 Oct 2022 17:42:43 +0200 Subject: [PATCH] get rids of the index crate + the document_types crate --- Cargo.lock | 327 +------ Cargo.toml | 2 - document-formats/Cargo.toml | 14 - document-formats/src/lib.rs | 155 ---- dump/Cargo.toml | 1 - index-scheduler/Cargo.toml | 3 - index-scheduler/src/autobatcher.rs | 4 +- index-scheduler/src/batch.rs | 27 +- index-scheduler/src/error.rs | 6 +- index-scheduler/src/index_mapper.rs | 13 +- index-scheduler/src/lib.rs | 18 +- index-scheduler/src/snapshot.rs | 10 +- index-scheduler/src/task.rs | 6 +- index-scheduler/src/utils.rs | 6 +- index/Cargo.toml | 33 - index/src/dump.rs | 160 ---- index/src/error.rs | 122 --- index/src/search.rs | 869 ------------------ index/src/updates.rs | 429 --------- meilisearch-http/Cargo.toml | 14 +- meilisearch-http/src/error.rs | 18 +- meilisearch-http/src/lib.rs | 4 +- meilisearch-http/src/option.rs | 6 +- .../src/routes/indexes/documents.rs | 100 +- meilisearch-http/src/routes/indexes/mod.rs | 4 +- meilisearch-http/src/routes/indexes/search.rs | 8 +- .../src/routes/indexes/settings.rs | 140 ++- meilisearch-http/src/routes/mod.rs | 2 +- meilisearch-types/Cargo.toml | 14 +- meilisearch-types/src/lib.rs | 8 + 30 files changed, 316 insertions(+), 2207 deletions(-) delete mode 100644 document-formats/Cargo.toml delete mode 100644 document-formats/src/lib.rs delete mode 100644 index/Cargo.toml delete mode 100644 index/src/dump.rs delete mode 100644 index/src/error.rs delete mode 100644 index/src/search.rs delete mode 100644 index/src/updates.rs diff --git a/Cargo.lock b/Cargo.lock index ff962f19a..a70a183d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -355,12 +355,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "atomic_refcell" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b5e5f48b927f04e952dedc932f31995a65a0bf65ec971c74436e51bf6e970d" - [[package]] name = "atty" version = "0.2.14" @@ -1023,17 +1017,6 @@ dependencies = [ "syn 1.0.101", ] -[[package]] -name = "derivative" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" -dependencies = [ - "proc-macro2 1.0.46", - "quote 1.0.21", - "syn 1.0.101", -] - [[package]] name = "derive_builder" version = "0.11.2" @@ -1084,12 +1067,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08ff6a4480d42625e59bc4e8b5dc3723279fd24d83afe8aa20df217276261cd6" -[[package]] -name = "difflib" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" - [[package]] name = "digest" version = "0.10.5" @@ -1122,24 +1099,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "document-formats" -version = "0.1.0" -dependencies = [ - "csv", - "either", - "meilisearch-types", - "milli 0.33.0", - "serde", - "serde_json", -] - -[[package]] -name = "downcast" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" - [[package]] name = "dump" version = "0.29.0" @@ -1148,7 +1107,6 @@ dependencies = [ "big_s", "flate2", "http", - "index", "index-scheduler", "insta", "log", @@ -1349,15 +1307,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "filter-parser" -version = "0.33.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.33.0#a79ff8a1a98a807f40f970131c8de2ab11560de5" -dependencies = [ - "nom", - "nom_locate", -] - [[package]] name = "filter-parser" version = "0.33.4" @@ -1377,14 +1326,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "flatten-serde-json" -version = "0.33.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.33.0#a79ff8a1a98a807f40f970131c8de2ab11560de5" -dependencies = [ - "serde_json", -] - [[package]] name = "flatten-serde-json" version = "0.33.4" @@ -1393,15 +1334,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "float-cmp" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" -dependencies = [ - "num-traits", -] - [[package]] name = "fnv" version = "1.0.7" @@ -1417,18 +1349,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fragile" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85dcb89d2b10c5f6133de2efd8c11959ce9dbb46a2f7a4cab208c4eeda6ce1ab" - -[[package]] -name = "fs_extra" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" - [[package]] name = "fst" version = "0.4.7" @@ -1822,35 +1742,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "index" -version = "0.1.0" -dependencies = [ - "anyhow", - "bincode", - "csv", - "derivative", - "either", - "file-store", - "fst", - "indexmap", - "lazy_static", - "log", - "meilisearch-types", - "milli 0.33.0", - "nelson", - "obkv", - "permissive-json-pointer", - "proptest", - "proptest-derive", - "regex", - "serde", - "serde_json", - "thiserror", - "time", - "uuid 1.1.2", -] - [[package]] name = "index-scheduler" version = "0.1.0" @@ -1861,13 +1752,10 @@ dependencies = [ "crossbeam", "csv", "derive_builder", - "document-formats", "file-store", - "index", "insta", "log", "meilisearch-types", - "milli 0.33.0", "nelson", "roaring", "serde", @@ -1975,14 +1863,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "json-depth-checker" -version = "0.33.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.33.0#a79ff8a1a98a807f40f970131c8de2ab11560de5" -dependencies = [ - "serde_json", -] - [[package]] name = "json-depth-checker" version = "0.33.4" @@ -2338,7 +2218,7 @@ dependencies = [ "enum-iterator", "hmac", "meilisearch-types", - "milli 0.33.4", + "milli", "rand", "serde", "serde_json", @@ -2368,7 +2248,6 @@ dependencies = [ "cargo_toml", "clap 4.0.9", "crossbeam-channel", - "document-formats", "either", "env_logger", "file-store", @@ -2378,7 +2257,6 @@ dependencies = [ "futures-util", "hex", "http", - "index", "index-scheduler", "indexmap", "itertools", @@ -2388,7 +2266,6 @@ dependencies = [ "manifest-dir-macros", "maplit", "meilisearch-auth", - "meilisearch-lib", "meilisearch-types", "mimalloc", "mime", @@ -2396,6 +2273,7 @@ dependencies = [ "obkv", "once_cell", "parking_lot", + "permissive-json-pointer", "pin-project-lite", "platform-dirs", "prometheus", @@ -2431,78 +2309,14 @@ dependencies = [ "zip", ] -[[package]] -name = "meilisearch-lib" -version = "0.29.1" -dependencies = [ - "actix-rt", - "actix-web", - "anyhow", - "async-stream", - "async-trait", - "atomic_refcell", - "byte-unit", - "bytes", - "clap 4.0.9", - "crossbeam-channel", - "csv", - "derivative", - "either", - "file-store", - "flate2", - "fs_extra", - "fst", - "futures", - "futures-util", - "http", - "index", - "index-scheduler", - "indexmap", - "itertools", - "lazy_static", - "log", - "meilisearch-auth", - "meilisearch-types", - "milli 0.33.4", - "mime", - "mockall", - "nelson", - "num_cpus", - "obkv", - "once_cell", - "page_size", - "parking_lot", - "paste", - "permissive-json-pointer", - "proptest", - "proptest-derive", - "rand", - "rayon", - "regex", - "reqwest", - "roaring", - "rustls", - "serde", - "serde_json", - "siphasher", - "slice-group-by", - "sysinfo", - "tar", - "tempfile", - "thiserror", - "time", - "tokio", - "uuid 1.1.2", - "walkdir", - "whoami", -] - [[package]] name = "meilisearch-types" version = "0.29.1" dependencies = [ "actix-web", - "milli 0.33.0", + "csv", + "either", + "milli", "proptest", "proptest-derive", "serde", @@ -2534,51 +2348,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "milli" -version = "0.33.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.33.0#a79ff8a1a98a807f40f970131c8de2ab11560de5" -dependencies = [ - "bimap", - "bincode", - "bstr 0.2.17", - "byteorder", - "charabia", - "concat-arrays", - "crossbeam-channel", - "csv", - "either", - "filter-parser 0.33.0", - "flatten-serde-json 0.33.0", - "fst", - "fxhash", - "geoutils", - "grenad", - "heed", - "itertools", - "json-depth-checker 0.33.0", - "levenshtein_automata", - "log", - "logging_timer", - "memmap2", - "obkv", - "once_cell", - "ordered-float", - "rayon", - "roaring", - "rstar", - "serde", - "serde_json", - "slice-group-by", - "smallstr", - "smallvec", - "smartstring", - "tempfile", - "thiserror", - "time", - "uuid 1.1.2", -] - [[package]] name = "milli" version = "0.33.4" @@ -2593,15 +2362,15 @@ dependencies = [ "crossbeam-channel", "csv", "either", - "filter-parser 0.33.4", - "flatten-serde-json 0.33.4", + "filter-parser", + "flatten-serde-json", "fst", "fxhash", "geoutils", "grenad", "heed", "itertools", - "json-depth-checker 0.33.4", + "json-depth-checker", "levenshtein_automata", "log", "logging_timer", @@ -2676,33 +2445,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "mockall" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2be9a9090bc1cac2930688fa9478092a64c6a92ddc6ae0692d46b37d9cab709" -dependencies = [ - "cfg-if", - "downcast", - "fragile", - "lazy_static", - "mockall_derive", - "predicates", - "predicates-tree", -] - -[[package]] -name = "mockall_derive" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86d702a0530a0141cf4ed147cf5ec7be6f2c187d4e37fcbefc39cf34116bfe8f" -dependencies = [ - "cfg-if", - "proc-macro2 1.0.46", - "quote 1.0.21", - "syn 1.0.101", -] - [[package]] name = "nb" version = "0.1.3" @@ -2744,12 +2486,6 @@ dependencies = [ "nom", ] -[[package]] -name = "normalize-line-endings" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" - [[package]] name = "ntapi" version = "0.4.0" @@ -3066,36 +2802,6 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" -[[package]] -name = "predicates" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5aab5be6e4732b473071984b3164dbbfb7a3674d30ea5ff44410b6bcd960c3c" -dependencies = [ - "difflib", - "float-cmp", - "itertools", - "normalize-line-endings", - "predicates-core", - "regex", -] - -[[package]] -name = "predicates-core" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da1c2388b1513e1b605fcec39a95e0a9e8ef088f71443ef37099fa9ae6673fcb" - -[[package]] -name = "predicates-tree" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d86de6de25020a36c6d3643a86d9a6a9f552107c0559c60ea03551b5e16c032" -dependencies = [ - "predicates-core", - "termtree", -] - [[package]] name = "proc-macro-error" version = "1.0.4" @@ -3915,12 +3621,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "termtree" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "507e9898683b6c43a9aa55b64259b721b52ba226e0f3779137e50ad114a4c90b" - [[package]] name = "textwrap" version = "0.15.1" @@ -4387,17 +4087,6 @@ dependencies = [ "hashbrown 0.7.2", ] -[[package]] -name = "whoami" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6631b6a2fd59b1841b622e8f1a7ad241ef0a46f2d580464ce8140ac94cbd571" -dependencies = [ - "bumpalo", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index eaf930a33..a17e7a170 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,6 @@ members = [ "meilisearch-types", "meilisearch-auth", "index-scheduler", - "document-formats", - "index", "dump", "file-store", "permissive-json-pointer", diff --git a/document-formats/Cargo.toml b/document-formats/Cargo.toml deleted file mode 100644 index 7f923dea4..000000000 --- a/document-formats/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "document-formats" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -csv = "1.1.6" -meilisearch-types = { path = "../meilisearch-types" } -either = { version = "1.6.1", features = ["serde"] } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" } -serde_json = { version = "1.0.85", features = ["preserve_order"] } -serde = { version = "1.0.136", features = ["derive"] } diff --git a/document-formats/src/lib.rs b/document-formats/src/lib.rs deleted file mode 100644 index ebc98f3fb..000000000 --- a/document-formats/src/lib.rs +++ /dev/null @@ -1,155 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug, Display}; -use std::io::{self, BufReader, Read, Seek, Write}; - -use either::Either; -use meilisearch_types::error::{Code, ErrorCode}; -use meilisearch_types::internal_error; -use milli::documents::{DocumentsBatchBuilder, Error}; -use milli::Object; -use serde::Deserialize; - -type Result = std::result::Result; - -#[derive(Debug)] -pub enum PayloadType { - Ndjson, - Json, - Csv, -} - -impl fmt::Display for PayloadType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - PayloadType::Ndjson => f.write_str("ndjson"), - PayloadType::Json => f.write_str("json"), - PayloadType::Csv => f.write_str("csv"), - } - } -} - -#[derive(Debug)] -pub enum DocumentFormatError { - Internal(Box), - MalformedPayload(Error, PayloadType), -} - -impl Display for DocumentFormatError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e), - Self::MalformedPayload(me, b) => match me.borrow() { - Error::Json(se) => { - // https://github.com/meilisearch/meilisearch/issues/2107 - // The user input maybe insanely long. We need to truncate it. - let mut serde_msg = se.to_string(); - let ellipsis = "..."; - if serde_msg.len() > 100 + ellipsis.len() { - serde_msg.replace_range(50..serde_msg.len() - 85, ellipsis); - } - - write!( - f, - "The `{}` payload provided is malformed. `Couldn't serialize document value: {}`.", - b, serde_msg - ) - } - _ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me), - }, - } - } -} - -impl std::error::Error for DocumentFormatError {} - -impl From<(PayloadType, Error)> for DocumentFormatError { - fn from((ty, error): (PayloadType, Error)) -> Self { - match error { - Error::Io(e) => Self::Internal(Box::new(e)), - e => Self::MalformedPayload(e, ty), - } - } -} - -impl ErrorCode for DocumentFormatError { - fn error_code(&self) -> Code { - match self { - DocumentFormatError::Internal(_) => Code::Internal, - DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload, - } - } -} - -internal_error!(DocumentFormatError: io::Error); - -/// Reads CSV from input and write an obkv batch to writer. -pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); - - let csv = csv::Reader::from_reader(input); - builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?; - - let count = builder.documents_count(); - let _ = builder - .into_inner() - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - - Ok(count as usize) -} - -/// Reads JSON Lines from input and write an obkv batch to writer. -pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); - let reader = BufReader::new(input); - - for result in serde_json::Deserializer::from_reader(reader).into_iter() { - let object = result - .map_err(Error::Json) - .map_err(|e| (PayloadType::Ndjson, e))?; - builder - .append_json_object(&object) - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - } - - let count = builder.documents_count(); - let _ = builder - .into_inner() - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - - Ok(count as usize) -} - -/// Reads JSON from input and write an obkv batch to writer. -pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); - let reader = BufReader::new(input); - - #[derive(Deserialize, Debug)] - #[serde(transparent)] - struct ArrayOrSingleObject { - #[serde(with = "either::serde_untagged")] - inner: Either, Object>, - } - - let content: ArrayOrSingleObject = serde_json::from_reader(reader) - .map_err(Error::Json) - .map_err(|e| (PayloadType::Json, e))?; - - for object in content.inner.map_right(|o| vec![o]).into_inner() { - builder - .append_json_object(&object) - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - } - - let count = builder.documents_count(); - let _ = builder - .into_inner() - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - - Ok(count as usize) -} diff --git a/dump/Cargo.toml b/dump/Cargo.toml index 199bc1c79..96f357397 100644 --- a/dump/Cargo.toml +++ b/dump/Cargo.toml @@ -6,7 +6,6 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -index = { path = "../index" } uuid = { version = "1.1.2", features = ["serde", "v4"] } serde_json = { version = "1.0.85", features = ["preserve_order"] } serde = { version = "1.0.136", features = ["derive"] } diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 2aa8f49e0..730f34b5c 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -11,10 +11,7 @@ bincode = "1.3.3" csv = "1.1.6" file-store = { path = "../file-store" } log = "0.4.14" -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" } -index = { path = "../index" } meilisearch-types = { path = "../meilisearch-types" } -document-formats = { path = "../document-formats" } roaring = "0.9.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.85", features = ["preserve_order"] } diff --git a/index-scheduler/src/autobatcher.rs b/index-scheduler/src/autobatcher.rs index 2d054c41a..168e8e035 100644 --- a/index-scheduler/src/autobatcher.rs +++ b/index-scheduler/src/autobatcher.rs @@ -1,4 +1,6 @@ -use milli::update::IndexDocumentsMethod::{self, ReplaceDocuments, UpdateDocuments}; +use meilisearch_types::milli::update::IndexDocumentsMethod::{ + self, ReplaceDocuments, UpdateDocuments, +}; use std::ops::ControlFlow::{self, Break, Continue}; use crate::{task::Kind, TaskId}; diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index bd4d852f7..c3656a315 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -3,14 +3,19 @@ use crate::{ task::{Details, Kind, KindWithContent, Status, Task}, Error, IndexScheduler, Result, TaskId, }; -use index::apply_settings_to_builder; -use index::error::IndexError; -use index::{Settings, Unchecked}; use log::{debug, info}; -use milli::heed::{RoTxn, RwTxn}; -use milli::update::IndexDocumentsConfig; -use milli::update::{DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod}; -use milli::{documents::DocumentsBatchReader, BEU32}; +use meilisearch_types::milli::update::IndexDocumentsConfig; +use meilisearch_types::milli::update::{ + DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod, +}; +use meilisearch_types::milli::{ + self, documents::DocumentsBatchReader, update::Settings as MilliSettings, BEU32, +}; +use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; +use meilisearch_types::{ + heed::{RoTxn, RwTxn}, + Index, +}; use roaring::RoaringBitmap; use uuid::Uuid; @@ -527,7 +532,7 @@ impl IndexScheduler { if let Some(primary_key) = primary_key.clone() { let mut index_wtxn = index.write_txn()?; - let mut builder = milli::update::Settings::new( + let mut builder = MilliSettings::new( &mut index_wtxn, &index, self.index_mapper.indexer_config(), @@ -576,7 +581,7 @@ impl IndexScheduler { fn apply_index_operation<'txn, 'i>( &self, index_wtxn: &'txn mut RwTxn<'i, '_>, - index: &'i milli::Index, + index: &'i Index, operation: IndexOperation, ) -> Result> { match operation { @@ -639,7 +644,7 @@ impl IndexScheduler { for content_uuid in content_files.into_iter() { let content_file = self.file_store.get_update(content_uuid)?; let reader = DocumentsBatchReader::from_reader(content_file) - .map_err(IndexError::from)?; + .map_err(milli::Error::from)?; let (new_builder, user_result) = builder.add_documents(reader)?; builder = new_builder; @@ -648,7 +653,7 @@ impl IndexScheduler { indexed_documents: count, number_of_documents: count, }), - Err(e) => Err(IndexError::from(e)), + Err(e) => Err(milli::Error::from(e)), }; results.push(user_result); diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs index af1decfe0..880121f69 100644 --- a/index-scheduler/src/error.rs +++ b/index-scheduler/src/error.rs @@ -1,5 +1,6 @@ use meilisearch_types::error::{Code, ErrorCode}; -use milli::heed; +use meilisearch_types::heed; +use meilisearch_types::milli; use thiserror::Error; use crate::TaskId; @@ -26,8 +27,6 @@ pub enum Error { #[error(transparent)] Milli(#[from] milli::Error), #[error(transparent)] - IndexError(#[from] index::error::IndexError), - #[error(transparent)] FileStore(#[from] file_store::Error), #[error(transparent)] IoError(#[from] std::io::Error), @@ -48,7 +47,6 @@ impl ErrorCode for Error { // TODO: TAMO: are all these errors really internal? Error::Heed(_) => Code::Internal, Error::Milli(_) => Code::Internal, - Error::IndexError(_) => Code::Internal, Error::FileStore(_) => Code::Internal, Error::IoError(_) => Code::Internal, Error::Anyhow(_) => Code::Internal, diff --git a/index-scheduler/src/index_mapper.rs b/index-scheduler/src/index_mapper.rs index 063688f9f..608bf8e72 100644 --- a/index-scheduler/src/index_mapper.rs +++ b/index-scheduler/src/index_mapper.rs @@ -5,13 +5,12 @@ use std::sync::{Arc, RwLock}; use std::{fs, thread}; use log::error; -use milli::Index; +use meilisearch_types::heed::types::{SerdeBincode, Str}; +use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn}; +use meilisearch_types::milli::update::IndexerConfig; +use meilisearch_types::milli::Index; use uuid::Uuid; -use milli::heed::types::{SerdeBincode, Str}; -use milli::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn}; -use milli::update::IndexerConfig; - use self::IndexStatus::{Available, BeingDeleted}; use crate::{Error, Result}; @@ -70,7 +69,7 @@ impl IndexMapper { fs::create_dir_all(&index_path)?; let mut options = EnvOpenOptions::new(); options.map_size(self.index_size); - Ok(milli::Index::new(options, &index_path)?) + Ok(Index::new(options, &index_path)?) } error => error, } @@ -153,7 +152,7 @@ impl IndexMapper { fs::create_dir_all(&index_path)?; let mut options = EnvOpenOptions::new(); options.map_size(self.index_size); - let index = milli::Index::new(options, &index_path)?; + let index = Index::new(options, &index_path)?; entry.insert(Available(index.clone())); index } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 6aa100e10..2430bb090 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -7,8 +7,6 @@ mod snapshot; pub mod task; mod utils; -pub use milli; - pub type Result = std::result::Result; pub type TaskId = u32; @@ -26,10 +24,10 @@ use synchronoise::SignalEvent; use time::OffsetDateTime; use uuid::Uuid; -use milli::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str}; -use milli::heed::{self, Database, Env}; -use milli::update::IndexerConfig; -use milli::{Index, RoaringBitmapCodec, BEU32}; +use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str}; +use meilisearch_types::heed::{self, Database, Env}; +use meilisearch_types::milli::update::IndexerConfig; +use meilisearch_types::milli::{Index, RoaringBitmapCodec, BEU32}; use crate::index_mapper::IndexMapper; use crate::task::Task; @@ -452,7 +450,7 @@ impl IndexScheduler { mod tests { use big_s::S; use insta::*; - use milli::update::IndexDocumentsMethod::ReplaceDocuments; + use meilisearch_types::milli::update::IndexDocumentsMethod::ReplaceDocuments; use tempfile::TempDir; use uuid::Uuid; @@ -512,7 +510,8 @@ mod tests { .create_update_file_with_uuid(file_uuid) .unwrap(); let documents_count = - document_formats::read_json(content.as_bytes(), file.as_file_mut()).unwrap() as u64; + meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut()) + .unwrap() as u64; (file, documents_count) } @@ -779,7 +778,8 @@ mod tests { let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); let documents_count = - document_formats::read_json(content.as_bytes(), file.as_file_mut()).unwrap() as u64; + meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut()) + .unwrap() as u64; index_scheduler .register(KindWithContent::DocumentImport { index_uid: S("doggos"), diff --git a/index-scheduler/src/snapshot.rs b/index-scheduler/src/snapshot.rs index 790bf70a8..237df6f90 100644 --- a/index-scheduler/src/snapshot.rs +++ b/index-scheduler/src/snapshot.rs @@ -1,10 +1,8 @@ -use milli::{ - heed::{ - types::{OwnedType, SerdeBincode, SerdeJson, Str}, - Database, RoTxn, - }, - RoaringBitmapCodec, BEU32, +use meilisearch_types::heed::{ + types::{OwnedType, SerdeBincode, SerdeJson, Str}, + Database, RoTxn, }; +use meilisearch_types::milli::{RoaringBitmapCodec, BEU32}; use roaring::RoaringBitmap; use crate::{ diff --git a/index-scheduler/src/task.rs b/index-scheduler/src/task.rs index 4429d0e7e..aecb0b1b5 100644 --- a/index-scheduler/src/task.rs +++ b/index-scheduler/src/task.rs @@ -1,7 +1,7 @@ use anyhow::Result; -use index::{Settings, Unchecked}; use meilisearch_types::error::ResponseError; -use milli::update::IndexDocumentsMethod; +use meilisearch_types::milli::update::IndexDocumentsMethod; +use meilisearch_types::settings::{Settings, Unchecked}; use serde::{Deserialize, Serialize, Serializer}; use std::{ @@ -543,7 +543,7 @@ fn serialize_duration( #[cfg(test)] mod tests { - use milli::heed::{types::SerdeJson, BytesDecode, BytesEncode}; + use meilisearch_types::heed::{types::SerdeJson, BytesDecode, BytesEncode}; use crate::assert_smol_debug_snapshot; diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs index 98b93ebfa..8a35ee387 100644 --- a/index-scheduler/src/utils.rs +++ b/index-scheduler/src/utils.rs @@ -1,9 +1,7 @@ //! Utility functions on the DBs. Mainly getter and setters. -use milli::{ - heed::{types::DecodeIgnore, RoTxn, RwTxn}, - BEU32, -}; +use meilisearch_types::heed::{types::DecodeIgnore, RoTxn, RwTxn}; +use meilisearch_types::milli::BEU32; use roaring::RoaringBitmap; use crate::{ diff --git a/index/Cargo.toml b/index/Cargo.toml deleted file mode 100644 index 008d25c28..000000000 --- a/index/Cargo.toml +++ /dev/null @@ -1,33 +0,0 @@ -[package] -name = "index" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -anyhow = "1.0.64" -bincode = "1.3.3" -csv = "1.1.6" -derivative = "2.2.0" -either = { version = "1.6.1", features = ["serde"] } -fst = "0.4.7" -indexmap = { version = "1.8.0", features = ["serde-1"] } -lazy_static = "1.4.0" -log = "0.4.14" -meilisearch-types = { path = "../meilisearch-types" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" } -obkv = "0.2.0" -permissive-json-pointer = { path = "../permissive-json-pointer" } -regex = "1.5.5" -serde = { version = "1.0.136", features = ["derive"] } -serde_json = { version = "1.0.85", features = ["preserve_order"] } -thiserror = "1.0.30" -time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } -file-store = { path = "../file-store" } -uuid = { version = "1.1.2", features = ["serde", "v4"] } - -[dev-dependencies] -nelson = { git = "https://github.com/meilisearch/nelson.git", rev = "675f13885548fb415ead8fbb447e9e6d9314000a"} -proptest = "1.0.0" -proptest-derive = "0.3.0" diff --git a/index/src/dump.rs b/index/src/dump.rs deleted file mode 100644 index 6a41fa7a0..000000000 --- a/index/src/dump.rs +++ /dev/null @@ -1,160 +0,0 @@ -use std::fs::{create_dir_all, File}; -use std::io::{BufReader, Seek, SeekFrom, Write}; -use std::path::Path; - -use anyhow::Context; -use indexmap::IndexMap; -use milli::documents::DocumentsBatchReader; -use milli::heed::{EnvOpenOptions, RoTxn}; -use milli::update::{IndexDocumentsConfig, IndexerConfig}; -use serde::{Deserialize, Serialize}; - -use crate::document_formats::read_ndjson; -use crate::index::updates::apply_settings_to_builder; - -use super::error::Result; -use super::{index::Index, Settings, Unchecked}; - -#[derive(Serialize, Deserialize)] -struct DumpMeta { - settings: Settings, - primary_key: Option, -} - -const META_FILE_NAME: &str = "meta.json"; -const DATA_FILE_NAME: &str = "documents.jsonl"; - -impl Index { - pub fn dump(&self, path: impl AsRef) -> Result<()> { - // acquire write txn make sure any ongoing write is finished before we start. - let txn = self.write_txn()?; - let path = path.as_ref().join(format!("indexes/{}", self.uuid)); - - create_dir_all(&path)?; - - self.dump_documents(&txn, &path)?; - self.dump_meta(&txn, &path)?; - - Ok(()) - } - - fn dump_documents(&self, txn: &RoTxn, path: impl AsRef) -> Result<()> { - let document_file_path = path.as_ref().join(DATA_FILE_NAME); - let mut document_file = File::create(&document_file_path)?; - - let documents = self.all_documents(txn)?; - let fields_ids_map = self.fields_ids_map(txn)?; - - // dump documents - let mut json_map = IndexMap::new(); - for document in documents { - let (_, reader) = document?; - - for (fid, bytes) in reader.iter() { - if let Some(name) = fields_ids_map.name(fid) { - json_map.insert(name, serde_json::from_slice::(bytes)?); - } - } - - serde_json::to_writer(&mut document_file, &json_map)?; - document_file.write_all(b"\n")?; - - json_map.clear(); - } - - Ok(()) - } - - fn dump_meta(&self, txn: &RoTxn, path: impl AsRef) -> Result<()> { - let meta_file_path = path.as_ref().join(META_FILE_NAME); - let mut meta_file = File::create(&meta_file_path)?; - - let settings = self.settings_txn(txn)?.into_unchecked(); - let primary_key = self.primary_key(txn)?.map(String::from); - let meta = DumpMeta { - settings, - primary_key, - }; - - serde_json::to_writer(&mut meta_file, &meta)?; - - Ok(()) - } - - pub fn load_dump( - src: impl AsRef, - dst: impl AsRef, - size: usize, - indexer_config: &IndexerConfig, - ) -> anyhow::Result<()> { - let dir_name = src - .as_ref() - .file_name() - .with_context(|| format!("invalid dump index: {}", src.as_ref().display()))?; - - let dst_dir_path = dst.as_ref().join("indexes").join(dir_name); - create_dir_all(&dst_dir_path)?; - - let meta_path = src.as_ref().join(META_FILE_NAME); - let meta_file = File::open(meta_path)?; - let DumpMeta { - settings, - primary_key, - } = serde_json::from_reader(meta_file)?; - let settings = settings.check(); - - let mut options = EnvOpenOptions::new(); - options.map_size(size); - let index = milli::Index::new(options, &dst_dir_path)?; - - let mut txn = index.write_txn()?; - - // Apply settings first - let mut builder = milli::update::Settings::new(&mut txn, &index, indexer_config); - - if let Some(primary_key) = primary_key { - builder.set_primary_key(primary_key); - } - - apply_settings_to_builder(&settings, &mut builder); - - builder.execute(|_| ())?; - - let document_file_path = src.as_ref().join(DATA_FILE_NAME); - let reader = BufReader::new(File::open(&document_file_path)?); - - let mut tmp_doc_file = tempfile::tempfile()?; - - let empty = match read_ndjson(reader, &mut tmp_doc_file) { - // if there was no document in the file it's because the index was empty - Ok(0) => true, - Ok(_) => false, - Err(e) => return Err(e.into()), - }; - - if !empty { - tmp_doc_file.seek(SeekFrom::Start(0))?; - - let documents_reader = DocumentsBatchReader::from_reader(tmp_doc_file)?; - - //If the document file is empty, we don't perform the document addition, to prevent - //a primary key error to be thrown. - let config = IndexDocumentsConfig::default(); - let builder = milli::update::IndexDocuments::new( - &mut txn, - &index, - indexer_config, - config, - |_| (), - )?; - let (builder, user_error) = builder.add_documents(documents_reader)?; - user_error?; - builder.execute()?; - } - - txn.commit()?; - index.prepare_for_closing().wait(); - - Ok(()) - } -} diff --git a/index/src/error.rs b/index/src/error.rs deleted file mode 100644 index c960d6925..000000000 --- a/index/src/error.rs +++ /dev/null @@ -1,122 +0,0 @@ -use std::error::Error; -use std::fmt; - -use meilisearch_types::error::{Code, ErrorCode}; -use meilisearch_types::internal_error; -use milli::UserError; -use serde_json::Value; - -pub type Result = std::result::Result; - -#[derive(Debug, thiserror::Error)] -pub enum IndexError { - #[error("An internal error has occurred. `{0}`.")] - Internal(Box), - #[error("Document `{0}` not found.")] - DocumentNotFound(String), - #[error("{0}")] - Facet(#[from] FacetError), - #[error("{0}")] - Milli(#[from] milli::Error), -} - -internal_error!( - IndexError: std::io::Error, - milli::heed::Error, - fst::Error, - serde_json::Error, - file_store::Error, - milli::documents::Error -); - -impl ErrorCode for IndexError { - fn error_code(&self) -> Code { - match self { - IndexError::Internal(_) => Code::Internal, - IndexError::DocumentNotFound(_) => Code::DocumentNotFound, - IndexError::Facet(e) => e.error_code(), - IndexError::Milli(e) => MilliError(e).error_code(), - } - } -} - -impl ErrorCode for &IndexError { - fn error_code(&self) -> Code { - match self { - IndexError::Internal(_) => Code::Internal, - IndexError::DocumentNotFound(_) => Code::DocumentNotFound, - IndexError::Facet(e) => e.error_code(), - IndexError::Milli(e) => MilliError(e).error_code(), - } - } -} - -impl From for IndexError { - fn from(error: milli::UserError) -> IndexError { - IndexError::Milli(error.into()) - } -} - -#[derive(Debug, thiserror::Error)] -pub enum FacetError { - #[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))] - InvalidExpression(&'static [&'static str], Value), -} - -impl ErrorCode for FacetError { - fn error_code(&self) -> Code { - match self { - FacetError::InvalidExpression(_, _) => Code::Filter, - } - } -} - -#[derive(Debug)] -pub struct MilliError<'a>(pub &'a milli::Error); - -impl Error for MilliError<'_> {} - -impl fmt::Display for MilliError<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } -} - -impl ErrorCode for MilliError<'_> { - fn error_code(&self) -> Code { - match self.0 { - milli::Error::InternalError(_) => Code::Internal, - milli::Error::IoError(_) => Code::Internal, - milli::Error::UserError(ref error) => { - match error { - // TODO: wait for spec for new error codes. - UserError::SerdeJson(_) - | UserError::InvalidLmdbOpenOptions - | UserError::DocumentLimitReached - | UserError::AccessingSoftDeletedDocument { .. } - | UserError::UnknownInternalDocumentId { .. } => Code::Internal, - UserError::InvalidStoreFile => Code::InvalidStore, - UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice, - UserError::MaxDatabaseSizeReached => Code::DatabaseSizeLimitReached, - UserError::AttributeLimitReached => Code::MaxFieldsLimitExceeded, - UserError::InvalidFilter(_) => Code::Filter, - UserError::MissingDocumentId { .. } => Code::MissingDocumentId, - UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => { - Code::InvalidDocumentId - } - UserError::MissingPrimaryKey => Code::MissingPrimaryKey, - UserError::PrimaryKeyCannotBeChanged(_) => Code::PrimaryKeyAlreadyPresent, - UserError::SortRankingRuleMissing => Code::Sort, - UserError::InvalidFacetsDistribution { .. } => Code::BadRequest, - UserError::InvalidSortableAttribute { .. } => Code::Sort, - UserError::CriterionError(_) => Code::InvalidRankingRule, - UserError::InvalidGeoField { .. } => Code::InvalidGeoField, - UserError::SortError(_) => Code::Sort, - UserError::InvalidMinTypoWordLenSetting(_, _) => { - Code::InvalidMinWordLengthForTypo - } - } - } - } - } -} diff --git a/index/src/search.rs b/index/src/search.rs deleted file mode 100644 index 4cd5647f3..000000000 --- a/index/src/search.rs +++ /dev/null @@ -1,869 +0,0 @@ -use std::cmp::min; -use std::collections::{BTreeMap, BTreeSet, HashSet}; -use std::marker::PhantomData; -use std::str::FromStr; -use std::time::Instant; - -use either::Either; -use fst::IntoStreamer; -use milli::heed::RoTxn; -use milli::tokenizer::TokenizerBuilder; -use milli::update::Setting; -use milli::{ - obkv_to_json, AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, - MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, -}; -use regex::Regex; -use serde::{Deserialize, Serialize}; -use serde_json::{json, Value}; - -use crate::error::FacetError; -use crate::updates::{FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, TypoSettings}; -use crate::{Checked, Settings}; - -use super::error::{IndexError, Result}; - -pub type Document = serde_json::Map; -type MatchesPosition = BTreeMap>; - -pub const DEFAULT_SEARCH_LIMIT: fn() -> usize = || 20; -pub const DEFAULT_CROP_LENGTH: fn() -> usize = || 10; -pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string(); -pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "".to_string(); -pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "".to_string(); - -/// The maximimum number of results that the engine -/// will be able to return in one search call. -pub const DEFAULT_PAGINATION_MAX_TOTAL_HITS: usize = 1000; - -#[derive(Deserialize, Debug, Clone, PartialEq, Eq)] -#[serde(rename_all = "camelCase", deny_unknown_fields)] -pub struct SearchQuery { - pub q: Option, - pub offset: Option, - #[serde(default = "DEFAULT_SEARCH_LIMIT")] - pub limit: usize, - pub attributes_to_retrieve: Option>, - pub attributes_to_crop: Option>, - #[serde(default = "DEFAULT_CROP_LENGTH")] - pub crop_length: usize, - pub attributes_to_highlight: Option>, - // Default to false - #[serde(default = "Default::default")] - pub show_matches_position: bool, - pub filter: Option, - pub sort: Option>, - pub facets: Option>, - #[serde(default = "DEFAULT_HIGHLIGHT_PRE_TAG")] - pub highlight_pre_tag: String, - #[serde(default = "DEFAULT_HIGHLIGHT_POST_TAG")] - pub highlight_post_tag: String, - #[serde(default = "DEFAULT_CROP_MARKER")] - pub crop_marker: String, - #[serde(default)] - pub matching_strategy: MatchingStrategy, -} - -#[derive(Deserialize, Debug, Clone, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub enum MatchingStrategy { - /// Remove query words from last to first - Last, - /// All query words are mandatory - All, -} - -impl Default for MatchingStrategy { - fn default() -> Self { - Self::Last - } -} - -impl From for TermsMatchingStrategy { - fn from(other: MatchingStrategy) -> Self { - match other { - MatchingStrategy::Last => Self::Last, - MatchingStrategy::All => Self::All, - } - } -} - -#[derive(Debug, Clone, Serialize, PartialEq)] -pub struct SearchHit { - #[serde(flatten)] - pub document: Document, - #[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")] - pub formatted: Document, - #[serde(rename = "_matchesPosition", skip_serializing_if = "Option::is_none")] - pub matches_position: Option, -} - -#[derive(Serialize, Debug, Clone, PartialEq)] -#[serde(rename_all = "camelCase")] -pub struct SearchResult { - pub hits: Vec, - pub estimated_total_hits: u64, - pub query: String, - pub limit: usize, - pub offset: usize, - pub processing_time_ms: u128, - #[serde(skip_serializing_if = "Option::is_none")] - pub facet_distribution: Option>>, -} - -pub fn perform_search(index: &Index, query: SearchQuery) -> Result { - let before_search = Instant::now(); - let rtxn = index.read_txn()?; - - let mut search = index.search(&rtxn); - - if let Some(ref query) = query.q { - search.query(query); - } - - search.terms_matching_strategy(query.matching_strategy.into()); - - let max_total_hits = index - .pagination_max_total_hits(&rtxn)? - .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS); - - // Make sure that a user can't get more documents than the hard limit, - // we align that on the offset too. - let offset = min(query.offset.unwrap_or(0), max_total_hits); - let limit = min(query.limit, max_total_hits.saturating_sub(offset)); - - search.offset(offset); - search.limit(limit); - - if let Some(ref filter) = query.filter { - if let Some(facets) = parse_filter(filter)? { - search.filter(facets); - } - } - - if let Some(ref sort) = query.sort { - let sort = match sort.iter().map(|s| AscDesc::from_str(s)).collect() { - Ok(sorts) => sorts, - Err(asc_desc_error) => { - return Err(IndexError::Milli(SortError::from(asc_desc_error).into())) - } - }; - - search.sort_criteria(sort); - } - - let milli::SearchResult { - documents_ids, - matching_words, - candidates, - .. - } = search.execute()?; - - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - - let displayed_ids = index - .displayed_fields_ids(&rtxn)? - .map(|fields| fields.into_iter().collect::>()) - .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); - - let fids = |attrs: &BTreeSet| { - let mut ids = BTreeSet::new(); - for attr in attrs { - if attr == "*" { - ids = displayed_ids.clone(); - break; - } - - if let Some(id) = fields_ids_map.id(attr) { - ids.insert(id); - } - } - ids - }; - - // The attributes to retrieve are the ones explicitly marked as to retrieve (all by default), - // but these attributes must be also be present - // - in the fields_ids_map - // - in the the displayed attributes - let to_retrieve_ids: BTreeSet<_> = query - .attributes_to_retrieve - .as_ref() - .map(fids) - .unwrap_or_else(|| displayed_ids.clone()) - .intersection(&displayed_ids) - .cloned() - .collect(); - - let attr_to_highlight = query.attributes_to_highlight.unwrap_or_default(); - - let attr_to_crop = query.attributes_to_crop.unwrap_or_default(); - - // Attributes in `formatted_options` correspond to the attributes that will be in `_formatted` - // These attributes are: - // - the attributes asked to be highlighted or cropped (with `attributesToCrop` or `attributesToHighlight`) - // - the attributes asked to be retrieved: these attributes will not be highlighted/cropped - // But these attributes must be also present in displayed attributes - let formatted_options = compute_formatted_options( - &attr_to_highlight, - &attr_to_crop, - query.crop_length, - &to_retrieve_ids, - &fields_ids_map, - &displayed_ids, - ); - - let tokenizer = TokenizerBuilder::default().build(); - - let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer); - formatter_builder.crop_marker(query.crop_marker); - formatter_builder.highlight_prefix(query.highlight_pre_tag); - formatter_builder.highlight_suffix(query.highlight_post_tag); - - let mut documents = Vec::new(); - - let documents_iter = index.documents(&rtxn, documents_ids)?; - - for (_id, obkv) in documents_iter { - // First generate a document with all the displayed fields - let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; - - // select the attributes to retrieve - let attributes_to_retrieve = to_retrieve_ids - .iter() - .map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); - let mut document = - permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); - - let (matches_position, formatted) = format_fields( - &displayed_document, - &fields_ids_map, - &formatter_builder, - &formatted_options, - query.show_matches_position, - &displayed_ids, - )?; - - if let Some(sort) = query.sort.as_ref() { - insert_geo_distance(sort, &mut document); - } - - let hit = SearchHit { - document, - formatted, - matches_position, - }; - documents.push(hit); - } - - let estimated_total_hits = candidates.len(); - - let facet_distribution = match query.facets { - Some(ref fields) => { - let mut facet_distribution = index.facets_distribution(&rtxn); - - let max_values_by_facet = index - .max_values_per_facet(&rtxn)? - .unwrap_or(DEFAULT_VALUES_PER_FACET); - facet_distribution.max_values_per_facet(max_values_by_facet); - - if fields.iter().all(|f| f != "*") { - facet_distribution.facets(fields); - } - let distribution = facet_distribution.candidates(candidates).execute()?; - - Some(distribution) - } - None => None, - }; - - let result = SearchResult { - hits: documents, - estimated_total_hits, - query: query.q.clone().unwrap_or_default(), - limit: query.limit, - offset: query.offset.unwrap_or_default(), - processing_time_ms: before_search.elapsed().as_millis(), - facet_distribution, - }; - Ok(result) -} - -pub fn all_documents<'a>( - index: &Index, - rtxn: &'a RoTxn, -) -> Result> + 'a> { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - - Ok(index.all_documents(rtxn)?.map(move |ret| { - ret.map_err(IndexError::from) - .and_then(|(_key, document)| -> Result<_> { - Ok(obkv_to_json(&all_fields, &fields_ids_map, document)?) - }) - })) -} - -pub fn retrieve_documents>( - index: &Index, - offset: usize, - limit: usize, - attributes_to_retrieve: Option>, -) -> Result<(u64, Vec)> { - let rtxn = index.read_txn()?; - - let mut documents = Vec::new(); - for document in all_documents(index, &rtxn)?.skip(offset).take(limit) { - let document = match &attributes_to_retrieve { - Some(attributes_to_retrieve) => permissive_json_pointer::select_values( - &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()), - ), - None => document?, - }; - documents.push(document); - } - - let number_of_documents = index.number_of_documents(&rtxn)?; - Ok((number_of_documents, documents)) -} - -pub fn retrieve_document>( - index: &Index, - doc_id: &str, - attributes_to_retrieve: Option>, -) -> Result { - let txn = index.read_txn()?; - - let fields_ids_map = index.fields_ids_map(&txn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - - let internal_id = index - .external_documents_ids(&txn)? - .get(doc_id.as_bytes()) - .ok_or_else(|| IndexError::DocumentNotFound(doc_id.to_string()))?; - - let document = index - .documents(&txn, std::iter::once(internal_id))? - .into_iter() - .next() - .map(|(_, d)| d) - .ok_or_else(|| IndexError::DocumentNotFound(doc_id.to_string()))?; - - let document = obkv_to_json(&all_fields, &fields_ids_map, document)?; - let document = match &attributes_to_retrieve { - Some(attributes_to_retrieve) => permissive_json_pointer::select_values( - &document, - attributes_to_retrieve.iter().map(|s| s.as_ref()), - ), - None => document, - }; - - Ok(document) -} - -pub fn settings(index: &Index, rtxn: &RoTxn) -> Result> { - let displayed_attributes = index - .displayed_fields(rtxn)? - .map(|fields| fields.into_iter().map(String::from).collect()); - - let searchable_attributes = index - .user_defined_searchable_fields(rtxn)? - .map(|fields| fields.into_iter().map(String::from).collect()); - - let filterable_attributes = index.filterable_fields(rtxn)?.into_iter().collect(); - - let sortable_attributes = index.sortable_fields(rtxn)?.into_iter().collect(); - - let criteria = index - .criteria(rtxn)? - .into_iter() - .map(|c| c.to_string()) - .collect(); - - let stop_words = index - .stop_words(rtxn)? - .map(|stop_words| -> Result> { - Ok(stop_words.stream().into_strs()?.into_iter().collect()) - }) - .transpose()? - .unwrap_or_default(); - let distinct_field = index.distinct_field(rtxn)?.map(String::from); - - // in milli each word in the synonyms map were split on their separator. Since we lost - // this information we are going to put space between words. - let synonyms = index - .synonyms(rtxn)? - .iter() - .map(|(key, values)| { - ( - key.join(" "), - values.iter().map(|value| value.join(" ")).collect(), - ) - }) - .collect(); - - let min_typo_word_len = MinWordSizeTyposSetting { - one_typo: Setting::Set(index.min_word_len_one_typo(rtxn)?), - two_typos: Setting::Set(index.min_word_len_two_typos(rtxn)?), - }; - - let disabled_words = match index.exact_words(rtxn)? { - Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(), - None => BTreeSet::new(), - }; - - let disabled_attributes = index - .exact_attributes(rtxn)? - .into_iter() - .map(String::from) - .collect(); - - let typo_tolerance = TypoSettings { - enabled: Setting::Set(index.authorize_typos(rtxn)?), - min_word_size_for_typos: Setting::Set(min_typo_word_len), - disable_on_words: Setting::Set(disabled_words), - disable_on_attributes: Setting::Set(disabled_attributes), - }; - - let faceting = FacetingSettings { - max_values_per_facet: Setting::Set( - index - .max_values_per_facet(rtxn)? - .unwrap_or(DEFAULT_VALUES_PER_FACET), - ), - }; - - let pagination = PaginationSettings { - max_total_hits: Setting::Set( - index - .pagination_max_total_hits(rtxn)? - .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS), - ), - }; - - Ok(Settings { - displayed_attributes: match displayed_attributes { - Some(attrs) => Setting::Set(attrs), - None => Setting::Reset, - }, - searchable_attributes: match searchable_attributes { - Some(attrs) => Setting::Set(attrs), - None => Setting::Reset, - }, - filterable_attributes: Setting::Set(filterable_attributes), - sortable_attributes: Setting::Set(sortable_attributes), - ranking_rules: Setting::Set(criteria), - stop_words: Setting::Set(stop_words), - distinct_attribute: match distinct_field { - Some(field) => Setting::Set(field), - None => Setting::Reset, - }, - synonyms: Setting::Set(synonyms), - typo_tolerance: Setting::Set(typo_tolerance), - faceting: Setting::Set(faceting), - pagination: Setting::Set(pagination), - _kind: PhantomData, - }) -} - -fn insert_geo_distance(sorts: &[String], document: &mut Document) { - lazy_static::lazy_static! { - static ref GEO_REGEX: Regex = - Regex::new(r"_geoPoint\(\s*([[:digit:].\-]+)\s*,\s*([[:digit:].\-]+)\s*\)").unwrap(); - }; - if let Some(capture_group) = sorts.iter().find_map(|sort| GEO_REGEX.captures(sort)) { - // TODO: TAMO: milli encountered an internal error, what do we want to do? - let base = [ - capture_group[1].parse().unwrap(), - capture_group[2].parse().unwrap(), - ]; - let geo_point = &document.get("_geo").unwrap_or(&json!(null)); - if let Some((lat, lng)) = geo_point["lat"].as_f64().zip(geo_point["lng"].as_f64()) { - let distance = milli::distance_between_two_points(&base, &[lat, lng]); - document.insert("_geoDistance".to_string(), json!(distance.round() as usize)); - } - } -} - -fn compute_formatted_options( - attr_to_highlight: &HashSet, - attr_to_crop: &[String], - query_crop_length: usize, - to_retrieve_ids: &BTreeSet, - fields_ids_map: &FieldsIdsMap, - displayed_ids: &BTreeSet, -) -> BTreeMap { - let mut formatted_options = BTreeMap::new(); - - add_highlight_to_formatted_options( - &mut formatted_options, - attr_to_highlight, - fields_ids_map, - displayed_ids, - ); - - add_crop_to_formatted_options( - &mut formatted_options, - attr_to_crop, - query_crop_length, - fields_ids_map, - displayed_ids, - ); - - // Should not return `_formatted` if no valid attributes to highlight/crop - if !formatted_options.is_empty() { - add_non_formatted_ids_to_formatted_options(&mut formatted_options, to_retrieve_ids); - } - - formatted_options -} - -fn add_highlight_to_formatted_options( - formatted_options: &mut BTreeMap, - attr_to_highlight: &HashSet, - fields_ids_map: &FieldsIdsMap, - displayed_ids: &BTreeSet, -) { - for attr in attr_to_highlight { - let new_format = FormatOptions { - highlight: true, - crop: None, - }; - - if attr == "*" { - for id in displayed_ids { - formatted_options.insert(*id, new_format); - } - break; - } - - if let Some(id) = fields_ids_map.id(attr) { - if displayed_ids.contains(&id) { - formatted_options.insert(id, new_format); - } - } - } -} - -fn add_crop_to_formatted_options( - formatted_options: &mut BTreeMap, - attr_to_crop: &[String], - crop_length: usize, - fields_ids_map: &FieldsIdsMap, - displayed_ids: &BTreeSet, -) { - for attr in attr_to_crop { - let mut split = attr.rsplitn(2, ':'); - let (attr_name, attr_len) = match split.next().zip(split.next()) { - Some((len, name)) => { - let crop_len = len.parse::().unwrap_or(crop_length); - (name, crop_len) - } - None => (attr.as_str(), crop_length), - }; - - if attr_name == "*" { - for id in displayed_ids { - formatted_options - .entry(*id) - .and_modify(|f| f.crop = Some(attr_len)) - .or_insert(FormatOptions { - highlight: false, - crop: Some(attr_len), - }); - } - } - - if let Some(id) = fields_ids_map.id(attr_name) { - if displayed_ids.contains(&id) { - formatted_options - .entry(id) - .and_modify(|f| f.crop = Some(attr_len)) - .or_insert(FormatOptions { - highlight: false, - crop: Some(attr_len), - }); - } - } - } -} - -fn add_non_formatted_ids_to_formatted_options( - formatted_options: &mut BTreeMap, - to_retrieve_ids: &BTreeSet, -) { - for id in to_retrieve_ids { - formatted_options.entry(*id).or_insert(FormatOptions { - highlight: false, - crop: None, - }); - } -} - -fn make_document( - displayed_attributes: &BTreeSet, - field_ids_map: &FieldsIdsMap, - obkv: obkv::KvReaderU16, -) -> Result { - let mut document = serde_json::Map::new(); - - // recreate the original json - for (key, value) in obkv.iter() { - let value = serde_json::from_slice(value)?; - let key = field_ids_map - .name(key) - .expect("Missing field name") - .to_string(); - - document.insert(key, value); - } - - // select the attributes to retrieve - let displayed_attributes = displayed_attributes - .iter() - .map(|&fid| field_ids_map.name(fid).expect("Missing field name")); - - let document = permissive_json_pointer::select_values(&document, displayed_attributes); - Ok(document) -} - -fn format_fields<'a, A: AsRef<[u8]>>( - document: &Document, - field_ids_map: &FieldsIdsMap, - builder: &MatcherBuilder<'a, A>, - formatted_options: &BTreeMap, - compute_matches: bool, - displayable_ids: &BTreeSet, -) -> Result<(Option, Document)> { - let mut matches_position = compute_matches.then(BTreeMap::new); - let mut document = document.clone(); - - // select the attributes to retrieve - let displayable_names = displayable_ids - .iter() - .map(|&fid| field_ids_map.name(fid).expect("Missing field name")); - permissive_json_pointer::map_leaf_values(&mut document, displayable_names, |key, value| { - // To get the formatting option of each key we need to see all the rules that applies - // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` - // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only - // highlighted. - let format = formatted_options - .iter() - .filter(|(field, _option)| { - let name = field_ids_map.name(**field).unwrap(); - milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) - }) - .map(|(_, option)| *option) - .reduce(|acc, option| acc.merge(option)); - let mut infos = Vec::new(); - - *value = format_value( - std::mem::take(value), - builder, - format, - &mut infos, - compute_matches, - ); - - if let Some(matches) = matches_position.as_mut() { - if !infos.is_empty() { - matches.insert(key.to_owned(), infos); - } - } - }); - - let selectors = formatted_options - .keys() - // This unwrap must be safe since we got the ids from the fields_ids_map just - // before. - .map(|&fid| field_ids_map.name(fid).unwrap()); - let document = permissive_json_pointer::select_values(&document, selectors); - - Ok((matches_position, document)) -} - -fn format_value<'a, A: AsRef<[u8]>>( - value: Value, - builder: &MatcherBuilder<'a, A>, - format_options: Option, - infos: &mut Vec, - compute_matches: bool, -) -> Value { - match value { - Value::String(old_string) => { - let mut matcher = builder.build(&old_string); - if compute_matches { - let matches = matcher.matches(); - infos.extend_from_slice(&matches[..]); - } - - match format_options { - Some(format_options) => { - let value = matcher.format(format_options); - Value::String(value.into_owned()) - } - None => Value::String(old_string), - } - } - Value::Array(values) => Value::Array( - values - .into_iter() - .map(|v| { - format_value( - v, - builder, - format_options.map(|format_options| FormatOptions { - highlight: format_options.highlight, - crop: None, - }), - infos, - compute_matches, - ) - }) - .collect(), - ), - Value::Object(object) => Value::Object( - object - .into_iter() - .map(|(k, v)| { - ( - k, - format_value( - v, - builder, - format_options.map(|format_options| FormatOptions { - highlight: format_options.highlight, - crop: None, - }), - infos, - compute_matches, - ), - ) - }) - .collect(), - ), - Value::Number(number) => { - let s = number.to_string(); - - let mut matcher = builder.build(&s); - if compute_matches { - let matches = matcher.matches(); - infos.extend_from_slice(&matches[..]); - } - - match format_options { - Some(format_options) => { - let value = matcher.format(format_options); - Value::String(value.into_owned()) - } - None => Value::Number(number), - } - } - value => value, - } -} - -fn parse_filter(facets: &Value) -> Result> { - match facets { - Value::String(expr) => { - let condition = Filter::from_str(expr)?; - Ok(condition) - } - Value::Array(arr) => parse_filter_array(arr), - v => Err(FacetError::InvalidExpression(&["Array"], v.clone()).into()), - } -} - -fn parse_filter_array(arr: &[Value]) -> Result> { - let mut ands = Vec::new(); - for value in arr { - match value { - Value::String(s) => ands.push(Either::Right(s.as_str())), - Value::Array(arr) => { - let mut ors = Vec::new(); - for value in arr { - match value { - Value::String(s) => ors.push(s.as_str()), - v => { - return Err(FacetError::InvalidExpression(&["String"], v.clone()).into()) - } - } - } - ands.push(Either::Left(ors)); - } - v => { - return Err( - FacetError::InvalidExpression(&["String", "[String]"], v.clone()).into(), - ) - } - } - } - - Ok(Filter::from_array(ands)?) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_insert_geo_distance() { - let value: Document = serde_json::from_str( - r#"{ - "_geo": { - "lat": 50.629973371633746, - "lng": 3.0569447399419567 - }, - "city": "Lille", - "id": "1" - }"#, - ) - .unwrap(); - - let sorters = &["_geoPoint(50.629973371633746,3.0569447399419567):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let sorters = &["_geoPoint(50.629973371633746, 3.0569447399419567):asc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let sorters = - &["_geoPoint( 50.629973371633746 , 3.0569447399419567 ):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let sorters = &[ - "prix:asc", - "villeneuve:desc", - "_geoPoint(50.629973371633746, 3.0569447399419567):asc", - "ubu:asc", - ] - .map(|s| s.to_string()); - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - // only the first geoPoint is used to compute the distance - let sorters = &[ - "chien:desc", - "_geoPoint(50.629973371633746, 3.0569447399419567):asc", - "pangolin:desc", - "_geoPoint(100.0, -80.0):asc", - "chat:asc", - ] - .map(|s| s.to_string()); - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - // there was no _geoPoint so nothing is inserted in the document - let sorters = &["chien:asc".to_string()]; - let mut document = value; - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), None); - } -} diff --git a/index/src/updates.rs b/index/src/updates.rs deleted file mode 100644 index a6d13d99f..000000000 --- a/index/src/updates.rs +++ /dev/null @@ -1,429 +0,0 @@ -use std::collections::{BTreeMap, BTreeSet}; -use std::marker::PhantomData; -use std::num::NonZeroUsize; - -use milli::update::Setting; -use serde::{Deserialize, Serialize, Serializer}; - -fn serialize_with_wildcard( - field: &Setting>, - s: S, -) -> std::result::Result -where - S: Serializer, -{ - let wildcard = vec!["*".to_string()]; - match field { - Setting::Set(value) => Some(value), - Setting::Reset => Some(&wildcard), - Setting::NotSet => None, - } - .serialize(s) -} - -#[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)] -pub struct Checked; - -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)] -pub struct Unchecked; - -#[cfg_attr(test, derive(proptest_derive::Arbitrary))] -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] -#[serde(deny_unknown_fields)] -#[serde(rename_all = "camelCase")] -pub struct MinWordSizeTyposSetting { - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub one_typo: Setting, - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub two_typos: Setting, -} - -#[cfg_attr(test, derive(proptest_derive::Arbitrary))] -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] -#[serde(deny_unknown_fields)] -#[serde(rename_all = "camelCase")] -pub struct TypoSettings { - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub enabled: Setting, - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub min_word_size_for_typos: Setting, - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub disable_on_words: Setting>, - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub disable_on_attributes: Setting>, -} - -#[cfg_attr(test, derive(proptest_derive::Arbitrary))] -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] -#[serde(deny_unknown_fields)] -#[serde(rename_all = "camelCase")] -pub struct FacetingSettings { - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub max_values_per_facet: Setting, -} - -#[cfg_attr(test, derive(proptest_derive::Arbitrary))] -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] -#[serde(deny_unknown_fields)] -#[serde(rename_all = "camelCase")] -pub struct PaginationSettings { - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - pub max_total_hits: Setting, -} - -/// Holds all the settings for an index. `T` can either be `Checked` if they represents settings -/// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a -/// call to `check` will return a `Settings` from a `Settings`. -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] -#[serde(deny_unknown_fields)] -#[serde(rename_all = "camelCase")] -#[serde(bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>"))] -#[cfg_attr(test, derive(proptest_derive::Arbitrary))] -pub struct Settings { - #[serde( - default, - serialize_with = "serialize_with_wildcard", - skip_serializing_if = "Setting::is_not_set" - )] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub displayed_attributes: Setting>, - - #[serde( - default, - serialize_with = "serialize_with_wildcard", - skip_serializing_if = "Setting::is_not_set" - )] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub searchable_attributes: Setting>, - - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub filterable_attributes: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub sortable_attributes: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub ranking_rules: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub stop_words: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub synonyms: Setting>>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub distinct_attribute: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub typo_tolerance: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub faceting: Setting, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] - pub pagination: Setting, - - #[serde(skip)] - pub _kind: PhantomData, -} - -impl Settings { - pub fn cleared() -> Settings { - Settings { - displayed_attributes: Setting::Reset, - searchable_attributes: Setting::Reset, - filterable_attributes: Setting::Reset, - sortable_attributes: Setting::Reset, - ranking_rules: Setting::Reset, - stop_words: Setting::Reset, - synonyms: Setting::Reset, - distinct_attribute: Setting::Reset, - typo_tolerance: Setting::Reset, - faceting: Setting::Reset, - pagination: Setting::Reset, - _kind: PhantomData, - } - } - - pub fn into_unchecked(self) -> Settings { - let Self { - displayed_attributes, - searchable_attributes, - filterable_attributes, - sortable_attributes, - ranking_rules, - stop_words, - synonyms, - distinct_attribute, - typo_tolerance, - faceting, - pagination, - .. - } = self; - - Settings { - displayed_attributes, - searchable_attributes, - filterable_attributes, - sortable_attributes, - ranking_rules, - stop_words, - synonyms, - distinct_attribute, - typo_tolerance, - faceting, - pagination, - _kind: PhantomData, - } - } -} - -impl Settings { - pub fn check(self) -> Settings { - let displayed_attributes = match self.displayed_attributes { - Setting::Set(fields) => { - if fields.iter().any(|f| f == "*") { - Setting::Reset - } else { - Setting::Set(fields) - } - } - otherwise => otherwise, - }; - - let searchable_attributes = match self.searchable_attributes { - Setting::Set(fields) => { - if fields.iter().any(|f| f == "*") { - Setting::Reset - } else { - Setting::Set(fields) - } - } - otherwise => otherwise, - }; - - Settings { - displayed_attributes, - searchable_attributes, - filterable_attributes: self.filterable_attributes, - sortable_attributes: self.sortable_attributes, - ranking_rules: self.ranking_rules, - stop_words: self.stop_words, - synonyms: self.synonyms, - distinct_attribute: self.distinct_attribute, - typo_tolerance: self.typo_tolerance, - faceting: self.faceting, - pagination: self.pagination, - _kind: PhantomData, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -#[serde(rename_all = "camelCase")] -pub struct Facets { - pub level_group_size: Option, - pub min_level_size: Option, -} - -pub fn apply_settings_to_builder( - settings: &Settings, - builder: &mut milli::update::Settings, -) { - match settings.searchable_attributes { - Setting::Set(ref names) => builder.set_searchable_fields(names.clone()), - Setting::Reset => builder.reset_searchable_fields(), - Setting::NotSet => (), - } - - match settings.displayed_attributes { - Setting::Set(ref names) => builder.set_displayed_fields(names.clone()), - Setting::Reset => builder.reset_displayed_fields(), - Setting::NotSet => (), - } - - match settings.filterable_attributes { - Setting::Set(ref facets) => { - builder.set_filterable_fields(facets.clone().into_iter().collect()) - } - Setting::Reset => builder.reset_filterable_fields(), - Setting::NotSet => (), - } - - match settings.sortable_attributes { - Setting::Set(ref fields) => builder.set_sortable_fields(fields.iter().cloned().collect()), - Setting::Reset => builder.reset_sortable_fields(), - Setting::NotSet => (), - } - - match settings.ranking_rules { - Setting::Set(ref criteria) => builder.set_criteria(criteria.clone()), - Setting::Reset => builder.reset_criteria(), - Setting::NotSet => (), - } - - match settings.stop_words { - Setting::Set(ref stop_words) => builder.set_stop_words(stop_words.clone()), - Setting::Reset => builder.reset_stop_words(), - Setting::NotSet => (), - } - - match settings.synonyms { - Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()), - Setting::Reset => builder.reset_synonyms(), - Setting::NotSet => (), - } - - match settings.distinct_attribute { - Setting::Set(ref attr) => builder.set_distinct_field(attr.clone()), - Setting::Reset => builder.reset_distinct_field(), - Setting::NotSet => (), - } - - match settings.typo_tolerance { - Setting::Set(ref value) => { - match value.enabled { - Setting::Set(val) => builder.set_autorize_typos(val), - Setting::Reset => builder.reset_authorize_typos(), - Setting::NotSet => (), - } - - match value.min_word_size_for_typos { - Setting::Set(ref setting) => { - match setting.one_typo { - Setting::Set(val) => builder.set_min_word_len_one_typo(val), - Setting::Reset => builder.reset_min_word_len_one_typo(), - Setting::NotSet => (), - } - match setting.two_typos { - Setting::Set(val) => builder.set_min_word_len_two_typos(val), - Setting::Reset => builder.reset_min_word_len_two_typos(), - Setting::NotSet => (), - } - } - Setting::Reset => { - builder.reset_min_word_len_one_typo(); - builder.reset_min_word_len_two_typos(); - } - Setting::NotSet => (), - } - - match value.disable_on_words { - Setting::Set(ref words) => { - builder.set_exact_words(words.clone()); - } - Setting::Reset => builder.reset_exact_words(), - Setting::NotSet => (), - } - - match value.disable_on_attributes { - Setting::Set(ref words) => { - builder.set_exact_attributes(words.iter().cloned().collect()) - } - Setting::Reset => builder.reset_exact_attributes(), - Setting::NotSet => (), - } - } - Setting::Reset => { - // all typo settings need to be reset here. - builder.reset_authorize_typos(); - builder.reset_min_word_len_one_typo(); - builder.reset_min_word_len_two_typos(); - builder.reset_exact_words(); - builder.reset_exact_attributes(); - } - Setting::NotSet => (), - } - - match settings.faceting { - Setting::Set(ref value) => match value.max_values_per_facet { - Setting::Set(val) => builder.set_max_values_per_facet(val), - Setting::Reset => builder.reset_max_values_per_facet(), - Setting::NotSet => (), - }, - Setting::Reset => builder.reset_max_values_per_facet(), - Setting::NotSet => (), - } - - match settings.pagination { - Setting::Set(ref value) => match value.max_total_hits { - Setting::Set(val) => builder.set_pagination_max_total_hits(val), - Setting::Reset => builder.reset_pagination_max_total_hits(), - Setting::NotSet => (), - }, - Setting::Reset => builder.reset_pagination_max_total_hits(), - Setting::NotSet => (), - } -} - -#[cfg(test)] -pub(crate) mod test { - use proptest::prelude::*; - - use super::*; - - pub(super) fn setting_strategy() -> impl Strategy> { - prop_oneof![ - Just(Setting::NotSet), - Just(Setting::Reset), - any::().prop_map(Setting::Set) - ] - } - - #[test] - fn test_setting_check() { - // test no changes - let settings = Settings { - displayed_attributes: Setting::Set(vec![String::from("hello")]), - searchable_attributes: Setting::Set(vec![String::from("hello")]), - filterable_attributes: Setting::NotSet, - sortable_attributes: Setting::NotSet, - ranking_rules: Setting::NotSet, - stop_words: Setting::NotSet, - synonyms: Setting::NotSet, - distinct_attribute: Setting::NotSet, - typo_tolerance: Setting::NotSet, - faceting: Setting::NotSet, - pagination: Setting::NotSet, - _kind: PhantomData::, - }; - - let checked = settings.clone().check(); - assert_eq!(settings.displayed_attributes, checked.displayed_attributes); - assert_eq!( - settings.searchable_attributes, - checked.searchable_attributes - ); - - // test wildcard - // test no changes - let settings = Settings { - displayed_attributes: Setting::Set(vec![String::from("*")]), - searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]), - filterable_attributes: Setting::NotSet, - sortable_attributes: Setting::NotSet, - ranking_rules: Setting::NotSet, - stop_words: Setting::NotSet, - synonyms: Setting::NotSet, - distinct_attribute: Setting::NotSet, - typo_tolerance: Setting::NotSet, - faceting: Setting::NotSet, - pagination: Setting::NotSet, - _kind: PhantomData::, - }; - - let checked = settings.check(); - assert_eq!(checked.displayed_attributes, Setting::Reset); - assert_eq!(checked.searchable_attributes, Setting::Reset); - } -} diff --git a/meilisearch-http/Cargo.toml b/meilisearch-http/Cargo.toml index 44899a19b..c69dfc6dc 100644 --- a/meilisearch-http/Cargo.toml +++ b/meilisearch-http/Cargo.toml @@ -47,17 +47,15 @@ jsonwebtoken = "8.1.1" log = "0.4.17" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } -meilisearch-lib = { path = "../meilisearch-lib", default-features = false } -index = { path = "../index" } index-scheduler = { path = "../index-scheduler" } file-store = { path = "../file-store" } -document-formats = { path = "../document-formats" } mimalloc = { version = "0.1.29", default-features = false } mime = "0.3.16" num_cpus = "1.13.1" obkv = "0.2.0" once_cell = "1.15.0" parking_lot = "0.12.1" +permissive-json-pointer = { path = "../permissive-json-pointer" } pin-project-lite = "0.2.9" platform-dirs = "0.3.0" rand = "0.8.5" @@ -98,7 +96,7 @@ yaup = "0.2.1" temp-env = "0.3.1" [features] -default = ["analytics", "meilisearch-lib/default", "mini-dashboard"] +default = ["analytics", "meilisearch-types/default", "mini-dashboard"] metrics = ["prometheus"] analytics = ["segment"] mini-dashboard = [ @@ -112,10 +110,10 @@ mini-dashboard = [ "tempfile", "zip", ] -chinese = ["meilisearch-lib/chinese"] -hebrew = ["meilisearch-lib/hebrew"] -japanese = ["meilisearch-lib/japanese"] -thai = ["meilisearch-lib/thai"] +chinese = ["meilisearch-types/chinese"] +hebrew = ["meilisearch-types/hebrew"] +japanese = ["meilisearch-types/japanese"] +thai = ["meilisearch-types/thai"] [package.metadata.mini-dashboard] assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.2/build.zip" diff --git a/meilisearch-http/src/error.rs b/meilisearch-http/src/error.rs index 22ffe2d36..6bd2519a6 100644 --- a/meilisearch-http/src/error.rs +++ b/meilisearch-http/src/error.rs @@ -1,7 +1,8 @@ use actix_web as aweb; use aweb::error::{JsonPayloadError, QueryPayloadError}; -use document_formats::DocumentFormatError; +use meilisearch_types::document_formats::DocumentFormatError; use meilisearch_types::error::{Code, ErrorCode, ResponseError}; +use serde_json::Value; use tokio::task::JoinError; #[derive(Debug, thiserror::Error)] @@ -14,9 +15,19 @@ pub enum MeilisearchHttpError { .1.iter().map(|s| format!("`{}`", s)).collect::>().join(", ") )] InvalidContentType(String, Vec), + #[error("Document `{0}` not found.")] + DocumentNotFound(String), + #[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))] + InvalidExpression(&'static [&'static str], Value), + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + #[error(transparent)] + HeedError(#[from] meilisearch_types::heed::Error), #[error(transparent)] IndexScheduler(#[from] index_scheduler::Error), #[error(transparent)] + Milli(#[from] meilisearch_types::milli::Error), + #[error(transparent)] Payload(#[from] PayloadError), #[error(transparent)] FileStore(#[from] file_store::Error), @@ -31,7 +42,12 @@ impl ErrorCode for MeilisearchHttpError { match self { MeilisearchHttpError::MissingContentType(_) => Code::MissingContentType, MeilisearchHttpError::InvalidContentType(_, _) => Code::InvalidContentType, + MeilisearchHttpError::DocumentNotFound(_) => Code::DocumentNotFound, + MeilisearchHttpError::InvalidExpression(_, _) => Code::Filter, + MeilisearchHttpError::SerdeJson(_) => Code::Internal, + MeilisearchHttpError::HeedError(_) => Code::Internal, MeilisearchHttpError::IndexScheduler(e) => e.error_code(), + MeilisearchHttpError::Milli(e) => e.error_code(), MeilisearchHttpError::Payload(e) => e.error_code(), MeilisearchHttpError::FileStore(_) => Code::Internal, MeilisearchHttpError::DocumentFormat(e) => e.error_code(), diff --git a/meilisearch-http/src/lib.rs b/meilisearch-http/src/lib.rs index 1763119e1..42b2e88e5 100644 --- a/meilisearch-http/src/lib.rs +++ b/meilisearch-http/src/lib.rs @@ -6,6 +6,7 @@ pub mod analytics; pub mod extractors; pub mod option; pub mod routes; +pub mod search; #[cfg(feature = "metrics")] pub mod metrics; @@ -38,6 +39,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result { opt.db_path.join("indexes"), opt.max_index_size.get_bytes() as usize, (&opt.indexer_options).try_into()?, + true, #[cfg(test)] todo!("We'll see later"), )?; @@ -45,8 +47,6 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result { /* TODO: We should start a thread to handle the snapshots. meilisearch - .set_max_index_size(opt.max_index_size.get_bytes() as usize) - .set_max_task_store_size(opt.max_task_db_size.get_bytes() as usize) // snapshot .set_ignore_missing_snapshot(opt.ignore_missing_snapshot) .set_ignore_snapshot_if_db_exists(opt.ignore_snapshot_if_db_exists) diff --git a/meilisearch-http/src/option.rs b/meilisearch-http/src/option.rs index 962094dbb..4c2deb26f 100644 --- a/meilisearch-http/src/option.rs +++ b/meilisearch-http/src/option.rs @@ -11,11 +11,7 @@ use std::{fmt, fs}; use byte_unit::{Byte, ByteError}; use clap::Parser; -use meilisearch_lib::{ - export_to_env_if_not_present, - options::{IndexerOpts, SchedulerConfig}, -}; -use index_scheduler::milli::update::IndexerConfig; +use meilisearch_types::milli::update::IndexerConfig; use rustls::{ server::{ AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, diff --git a/meilisearch-http/src/routes/indexes/documents.rs b/meilisearch-http/src/routes/indexes/documents.rs index 1f68245c0..d036b719a 100644 --- a/meilisearch-http/src/routes/indexes/documents.rs +++ b/meilisearch-http/src/routes/indexes/documents.rs @@ -1,26 +1,24 @@ use std::io::Cursor; -use actix_web::error::PayloadError; use actix_web::http::header::CONTENT_TYPE; -use actix_web::web::{Bytes, Data}; +use actix_web::web::Data; use actix_web::HttpMessage; use actix_web::{web, HttpRequest, HttpResponse}; use bstr::ByteSlice; -use document_formats::{read_csv, read_json, read_ndjson, PayloadType}; -use futures::{Stream, StreamExt}; -use index::{retrieve_document, retrieve_documents}; -use index_scheduler::milli::update::IndexDocumentsMethod; -use index_scheduler::IndexScheduler; -use index_scheduler::{KindWithContent, TaskView}; +use futures::StreamExt; +use index_scheduler::{IndexScheduler, KindWithContent, TaskView}; use log::debug; +use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType}; use meilisearch_types::error::ResponseError; +use meilisearch_types::heed::RoTxn; +use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::star_or::StarOr; +use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; use serde::Deserialize; use serde_cs::vec::CS; use serde_json::Value; -use tokio::sync::mpsc; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; @@ -37,17 +35,6 @@ static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { ] }); -/// This is required because Payload is not Sync nor Send -fn payload_to_stream(mut payload: Payload) -> impl Stream> { - let (snd, recv) = mpsc::channel(1); - tokio::task::spawn_local(async move { - while let Some(data) = payload.next().await { - let _ = snd.send(data).await; - } - }); - tokio_stream::wrappers::ReceiverStream::new(recv) -} - /// Extracts the mime type from the content type and return /// a meilisearch error if anything bad happen. fn extract_mime_type(req: &HttpRequest) -> Result, MeilisearchHttpError> { @@ -344,3 +331,76 @@ pub async fn clear_all_documents( debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } + +fn all_documents<'a>( + index: &Index, + rtxn: &'a RoTxn, +) -> Result> + 'a, ResponseError> { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + + Ok(index.all_documents(rtxn)?.map(move |ret| { + ret.map_err(ResponseError::from) + .and_then(|(_key, document)| -> Result<_, ResponseError> { + Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?) + }) + })) +} + +fn retrieve_documents>( + index: &Index, + offset: usize, + limit: usize, + attributes_to_retrieve: Option>, +) -> Result<(u64, Vec), ResponseError> { + let rtxn = index.read_txn()?; + + let mut documents = Vec::new(); + for document in all_documents(index, &rtxn)?.skip(offset).take(limit) { + let document = match &attributes_to_retrieve { + Some(attributes_to_retrieve) => permissive_json_pointer::select_values( + &document?, + attributes_to_retrieve.iter().map(|s| s.as_ref()), + ), + None => document?, + }; + documents.push(document); + } + + let number_of_documents = index.number_of_documents(&rtxn)?; + Ok((number_of_documents, documents)) +} + +fn retrieve_document>( + index: &Index, + doc_id: &str, + attributes_to_retrieve: Option>, +) -> Result { + let txn = index.read_txn()?; + + let fields_ids_map = index.fields_ids_map(&txn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + + let internal_id = index + .external_documents_ids(&txn)? + .get(doc_id.as_bytes()) + .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; + + let document = index + .documents(&txn, std::iter::once(internal_id))? + .into_iter() + .next() + .map(|(_, d)| d) + .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; + + let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; + let document = match &attributes_to_retrieve { + Some(attributes_to_retrieve) => permissive_json_pointer::select_values( + &document, + attributes_to_retrieve.iter().map(|s| s.as_ref()), + ), + None => document, + }; + + Ok(document) +} diff --git a/meilisearch-http/src/routes/indexes/mod.rs b/meilisearch-http/src/routes/indexes/mod.rs index c120d1e00..6fd2066cf 100644 --- a/meilisearch-http/src/routes/indexes/mod.rs +++ b/meilisearch-http/src/routes/indexes/mod.rs @@ -1,9 +1,9 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; -use index_scheduler::milli::{FieldDistribution, Index}; use index_scheduler::{IndexScheduler, KindWithContent, Query, Status}; use log::debug; use meilisearch_types::error::ResponseError; +use meilisearch_types::milli::{self, FieldDistribution, Index}; use serde::{Deserialize, Serialize}; use serde_json::json; use time::OffsetDateTime; @@ -51,7 +51,7 @@ pub struct IndexView { } impl IndexView { - fn new(uid: String, index: &Index) -> Result { + fn new(uid: String, index: &Index) -> Result { let rtxn = index.read_txn()?; Ok(IndexView { uid, diff --git a/meilisearch-http/src/routes/indexes/search.rs b/meilisearch-http/src/routes/indexes/search.rs index f19ebdaee..374aa3c89 100644 --- a/meilisearch-http/src/routes/indexes/search.rs +++ b/meilisearch-http/src/routes/indexes/search.rs @@ -1,9 +1,5 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; -use index::{ - perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, -}; use index_scheduler::IndexScheduler; use log::debug; use meilisearch_auth::IndexSearchRules; @@ -15,6 +11,10 @@ use serde_json::Value; use crate::analytics::{Analytics, SearchAggregator}; use crate::extractors::authentication::{policies::*, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::search::{ + perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, +}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( diff --git a/meilisearch-http/src/routes/indexes/settings.rs b/meilisearch-http/src/routes/indexes/settings.rs index b11a863bc..c74c0dbcc 100644 --- a/meilisearch-http/src/routes/indexes/settings.rs +++ b/meilisearch-http/src/routes/indexes/settings.rs @@ -1,14 +1,26 @@ +use std::collections::BTreeSet; +use std::marker::PhantomData; + use actix_web::web::Data; +use fst::IntoStreamer; use log::debug; use actix_web::{web, HttpRequest, HttpResponse}; -use index::{Settings, Unchecked}; use index_scheduler::{IndexScheduler, KindWithContent}; use meilisearch_types::error::ResponseError; +use meilisearch_types::heed::RoTxn; +use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::{self, DEFAULT_VALUES_PER_FACET}; +use meilisearch_types::settings::{ + Checked, FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, Settings, TypoSettings, + Unchecked, +}; +use meilisearch_types::Index; use serde_json::json; use crate::analytics::Analytics; use crate::extractors::authentication::{policies::*, GuardedData}; +use crate::search::DEFAULT_PAGINATION_MAX_TOTAL_HITS; #[macro_export] macro_rules! make_setting_route { @@ -18,14 +30,15 @@ macro_rules! make_setting_route { use actix_web::{web, HttpRequest, HttpResponse, Resource}; use log::debug; - use index::Settings; - use index_scheduler::milli::update::Setting; use index_scheduler::{IndexScheduler, KindWithContent}; + use meilisearch_types::milli::update::Setting; + use meilisearch_types::settings::Settings; use meilisearch_types::error::ResponseError; use $crate::analytics::Analytics; use $crate::extractors::authentication::{policies::*, GuardedData}; use $crate::extractors::sequential_extractor::SeqHandler; + use $crate::routes::indexes::settings::settings; pub async fn delete( index_scheduler: GuardedData< @@ -98,7 +111,7 @@ macro_rules! make_setting_route { ) -> std::result::Result { let index = index_scheduler.index(&index_uid)?; let rtxn = index.read_txn()?; - let settings = index::settings(&index, &rtxn)?; + let settings = settings(&index, &rtxn)?; debug!("returns: {:?}", settings); let mut json = serde_json::json!(&settings); @@ -185,11 +198,11 @@ make_setting_route!( make_setting_route!( "/typo-tolerance", patch, - index::updates::TypoSettings, + meilisearch_types::settings::TypoSettings, typo_tolerance, "typoTolerance", analytics, - |setting: &Option, req: &HttpRequest| { + |setting: &Option, req: &HttpRequest| { use serde_json::json; analytics.publish( @@ -295,11 +308,11 @@ make_setting_route!( make_setting_route!( "/faceting", patch, - index::updates::FacetingSettings, + meilisearch_types::settings::FacetingSettings, faceting, "faceting", analytics, - |setting: &Option, req: &HttpRequest| { + |setting: &Option, req: &HttpRequest| { use serde_json::json; analytics.publish( @@ -317,11 +330,11 @@ make_setting_route!( make_setting_route!( "/pagination", patch, - index::updates::PaginationSettings, + meilisearch_types::settings::PaginationSettings, pagination, "pagination", analytics, - |setting: &Option, req: &HttpRequest| { + |setting: &Option, req: &HttpRequest| { use serde_json::json; analytics.publish( @@ -456,7 +469,7 @@ pub async fn get_all( ) -> Result { let index = index_scheduler.index(&index_uid)?; let rtxn = index.read_txn()?; - let new_settings = index::settings(&index, &rtxn)?; + let new_settings = settings(&index, &rtxn)?; debug!("returns: {:?}", new_settings); Ok(HttpResponse::Ok().json(new_settings)) } @@ -479,3 +492,108 @@ pub async fn delete_all( debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } + +pub fn settings(index: &Index, rtxn: &RoTxn) -> Result, milli::Error> { + let displayed_attributes = index + .displayed_fields(rtxn)? + .map(|fields| fields.into_iter().map(String::from).collect()); + + let searchable_attributes = index + .user_defined_searchable_fields(rtxn)? + .map(|fields| fields.into_iter().map(String::from).collect()); + + let filterable_attributes = index.filterable_fields(rtxn)?.into_iter().collect(); + + let sortable_attributes = index.sortable_fields(rtxn)?.into_iter().collect(); + + let criteria = index + .criteria(rtxn)? + .into_iter() + .map(|c| c.to_string()) + .collect(); + + let stop_words = index + .stop_words(rtxn)? + .map(|stop_words| -> Result, milli::Error> { + Ok(stop_words.stream().into_strs()?.into_iter().collect()) + }) + .transpose()? + .unwrap_or_default(); + let distinct_field = index.distinct_field(rtxn)?.map(String::from); + + // in milli each word in the synonyms map were split on their separator. Since we lost + // this information we are going to put space between words. + let synonyms = index + .synonyms(rtxn)? + .iter() + .map(|(key, values)| { + ( + key.join(" "), + values.iter().map(|value| value.join(" ")).collect(), + ) + }) + .collect(); + + let min_typo_word_len = MinWordSizeTyposSetting { + one_typo: Setting::Set(index.min_word_len_one_typo(rtxn)?), + two_typos: Setting::Set(index.min_word_len_two_typos(rtxn)?), + }; + + let disabled_words = match index.exact_words(rtxn)? { + Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(), + None => BTreeSet::new(), + }; + + let disabled_attributes = index + .exact_attributes(rtxn)? + .into_iter() + .map(String::from) + .collect(); + + let typo_tolerance = TypoSettings { + enabled: Setting::Set(index.authorize_typos(rtxn)?), + min_word_size_for_typos: Setting::Set(min_typo_word_len), + disable_on_words: Setting::Set(disabled_words), + disable_on_attributes: Setting::Set(disabled_attributes), + }; + + let faceting = FacetingSettings { + max_values_per_facet: Setting::Set( + index + .max_values_per_facet(rtxn)? + .unwrap_or(DEFAULT_VALUES_PER_FACET), + ), + }; + + let pagination = PaginationSettings { + max_total_hits: Setting::Set( + index + .pagination_max_total_hits(rtxn)? + .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS), + ), + }; + + Ok(Settings { + displayed_attributes: match displayed_attributes { + Some(attrs) => Setting::Set(attrs), + None => Setting::Reset, + }, + searchable_attributes: match searchable_attributes { + Some(attrs) => Setting::Set(attrs), + None => Setting::Reset, + }, + filterable_attributes: Setting::Set(filterable_attributes), + sortable_attributes: Setting::Set(sortable_attributes), + ranking_rules: Setting::Set(criteria), + stop_words: Setting::Set(stop_words), + distinct_attribute: match distinct_field { + Some(field) => Setting::Set(field), + None => Setting::Reset, + }, + synonyms: Setting::Set(synonyms), + typo_tolerance: Setting::Set(typo_tolerance), + faceting: Setting::Set(faceting), + pagination: Setting::Set(pagination), + _kind: PhantomData, + }) +} diff --git a/meilisearch-http/src/routes/mod.rs b/meilisearch-http/src/routes/mod.rs index 286225d7a..b47c0f0cb 100644 --- a/meilisearch-http/src/routes/mod.rs +++ b/meilisearch-http/src/routes/mod.rs @@ -2,10 +2,10 @@ use std::collections::BTreeMap; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; -use index::{Settings, Unchecked}; use index_scheduler::{IndexScheduler, Query, Status}; use log::debug; use meilisearch_types::error::ResponseError; +use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::star_or::StarOr; use serde::{Deserialize, Serialize}; use serde_json::json; diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index cf5ad5ed2..ea6710c5e 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -6,12 +6,24 @@ edition = "2021" [dependencies] actix-web = { version = "4.2.1", default-features = false } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" } +csv = "1.1.6" +either = { version = "1.6.1", features = ["serde"] } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.4", default-features = false } proptest = { version = "1.0.0", optional = true } proptest-derive = { version = "0.3.0", optional = true } serde = { version = "1.0.145", features = ["derive"] } serde_json = "1.0.85" tokio = "1.0" +[dev-dependencies] +proptest = "1.0.0" +proptest-derive = "0.3.0" + [features] +default = ["milli/default"] + test-traits = ["proptest", "proptest-derive"] +chinese = ["milli/chinese"] +hebrew = ["milli/hebrew"] +japanese = ["milli/japanese"] +thai = ["milli/thai"] diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs index 2d685c2dc..1d2ba0ffd 100644 --- a/meilisearch-types/src/lib.rs +++ b/meilisearch-types/src/lib.rs @@ -1,3 +1,11 @@ +pub mod document_formats; pub mod error; pub mod index_uid; +pub mod settings; pub mod star_or; + +pub use milli; +pub use milli::heed; +pub use milli::Index; + +pub type Document = serde_json::Map;