From 94e29a9f5f7be8ccb2587d2b533d60b23e1910fd Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 13 Sep 2022 20:04:48 +0200 Subject: [PATCH] extract the index abstraction out of the index-scheduler in its own module --- Cargo.lock | 25 ++- Cargo.toml | 1 + index-scheduler/Cargo.toml | 17 +- index-scheduler/src/document_formats.rs | 155 ------------------ index-scheduler/src/error.rs | 2 - index-scheduler/src/lib.rs | 2 - index/Cargo.toml | 33 ++++ .../src/index => index/src}/dump.rs | 0 .../src/index => index/src}/error.rs | 0 .../src/index => index/src}/index.rs | 2 +- .../src/index/mod.rs => index/src/lib.rs | 8 +- index/src/main.rs | 3 + .../src/index => index/src}/search.rs | 2 +- .../src/index => index/src}/updates.rs | 0 14 files changed, 64 insertions(+), 186 deletions(-) delete mode 100644 index-scheduler/src/document_formats.rs create mode 100644 index/Cargo.toml rename {index-scheduler/src/index => index/src}/dump.rs (100%) rename {index-scheduler/src/index => index/src}/error.rs (100%) rename {index-scheduler/src/index => index/src}/index.rs (99%) rename index-scheduler/src/index/mod.rs => index/src/lib.rs (97%) create mode 100644 index/src/main.rs rename {index-scheduler/src/index => index/src}/search.rs (99%) rename {index-scheduler/src/index => index/src}/updates.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 9923865d4..e18946125 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1729,10 +1729,9 @@ dependencies = [ ] [[package]] -name = "index-scheduler" +name = "index" version = "0.1.0" dependencies = [ - "actix-rt", "anyhow", "bincode", "csv", @@ -1745,17 +1744,33 @@ dependencies = [ "log", "meilisearch-types", "milli 0.33.0", - "mockall", "nelson", "obkv", - "paste", "permissive-json-pointer", "proptest", "proptest-derive", "regex", - "roaring 0.9.0", "serde", "serde_json", + "thiserror", + "time", + "uuid 1.1.2", +] + +[[package]] +name = "index-scheduler" +version = "0.1.0" +dependencies = [ + "anyhow", + "bincode", + "csv", + "file-store", + "index", + "log", + "milli 0.33.0", + "nelson", + "roaring 0.9.0", + "serde", "tempfile", "thiserror", "time", diff --git a/Cargo.toml b/Cargo.toml index 49122460d..28a0e8742 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "meilisearch-lib", "meilisearch-auth", "index-scheduler", + "index", "file-store", "permissive-json-pointer", ] diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 6a512a164..45d21e0ec 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -9,31 +9,16 @@ edition = "2021" anyhow = "1.0.64" bincode = "1.3.3" csv = "1.1.6" -derivative = "2.2.0" -either = { version = "1.6.1", features = ["serde"] } file-store = { path = "../file-store" } -fst = "0.4.7" -indexmap = { version = "1.8.0", features = ["serde-1"] } -lazy_static = "1.4.0" log = "0.4.14" -meilisearch-types = { path = "../meilisearch-types" } milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" } -obkv = "0.2.0" -permissive-json-pointer = { path = "../permissive-json-pointer" } -regex = "1.5.5" +index = { path = "../index" } roaring = "0.9.0" serde = { version = "1.0.136", features = ["derive"] } -serde_json = { version = "1.0.85", features = ["preserve_order"] } tempfile = "3.3.0" thiserror = "1.0.30" time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "1.1.2", features = ["serde", "v4"] } [dev-dependencies] -actix-rt = "2.7.0" -meilisearch-types = { path = "../meilisearch-types", features = ["test-traits"] } -mockall = "0.11.0" nelson = { git = "https://github.com/meilisearch/nelson.git", rev = "675f13885548fb415ead8fbb447e9e6d9314000a"} -paste = "1.0.6" -proptest = "1.0.0" -proptest-derive = "0.3.0" diff --git a/index-scheduler/src/document_formats.rs b/index-scheduler/src/document_formats.rs deleted file mode 100644 index ebc98f3fb..000000000 --- a/index-scheduler/src/document_formats.rs +++ /dev/null @@ -1,155 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug, Display}; -use std::io::{self, BufReader, Read, Seek, Write}; - -use either::Either; -use meilisearch_types::error::{Code, ErrorCode}; -use meilisearch_types::internal_error; -use milli::documents::{DocumentsBatchBuilder, Error}; -use milli::Object; -use serde::Deserialize; - -type Result = std::result::Result; - -#[derive(Debug)] -pub enum PayloadType { - Ndjson, - Json, - Csv, -} - -impl fmt::Display for PayloadType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - PayloadType::Ndjson => f.write_str("ndjson"), - PayloadType::Json => f.write_str("json"), - PayloadType::Csv => f.write_str("csv"), - } - } -} - -#[derive(Debug)] -pub enum DocumentFormatError { - Internal(Box), - MalformedPayload(Error, PayloadType), -} - -impl Display for DocumentFormatError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e), - Self::MalformedPayload(me, b) => match me.borrow() { - Error::Json(se) => { - // https://github.com/meilisearch/meilisearch/issues/2107 - // The user input maybe insanely long. We need to truncate it. - let mut serde_msg = se.to_string(); - let ellipsis = "..."; - if serde_msg.len() > 100 + ellipsis.len() { - serde_msg.replace_range(50..serde_msg.len() - 85, ellipsis); - } - - write!( - f, - "The `{}` payload provided is malformed. `Couldn't serialize document value: {}`.", - b, serde_msg - ) - } - _ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me), - }, - } - } -} - -impl std::error::Error for DocumentFormatError {} - -impl From<(PayloadType, Error)> for DocumentFormatError { - fn from((ty, error): (PayloadType, Error)) -> Self { - match error { - Error::Io(e) => Self::Internal(Box::new(e)), - e => Self::MalformedPayload(e, ty), - } - } -} - -impl ErrorCode for DocumentFormatError { - fn error_code(&self) -> Code { - match self { - DocumentFormatError::Internal(_) => Code::Internal, - DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload, - } - } -} - -internal_error!(DocumentFormatError: io::Error); - -/// Reads CSV from input and write an obkv batch to writer. -pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); - - let csv = csv::Reader::from_reader(input); - builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?; - - let count = builder.documents_count(); - let _ = builder - .into_inner() - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - - Ok(count as usize) -} - -/// Reads JSON Lines from input and write an obkv batch to writer. -pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); - let reader = BufReader::new(input); - - for result in serde_json::Deserializer::from_reader(reader).into_iter() { - let object = result - .map_err(Error::Json) - .map_err(|e| (PayloadType::Ndjson, e))?; - builder - .append_json_object(&object) - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - } - - let count = builder.documents_count(); - let _ = builder - .into_inner() - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - - Ok(count as usize) -} - -/// Reads JSON from input and write an obkv batch to writer. -pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); - let reader = BufReader::new(input); - - #[derive(Deserialize, Debug)] - #[serde(transparent)] - struct ArrayOrSingleObject { - #[serde(with = "either::serde_untagged")] - inner: Either, Object>, - } - - let content: ArrayOrSingleObject = serde_json::from_reader(reader) - .map_err(Error::Json) - .map_err(|e| (PayloadType::Json, e))?; - - for object in content.inner.map_right(|o| vec![o]).into_inner() { - builder - .append_json_object(&object) - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - } - - let count = builder.documents_count(); - let _ = builder - .into_inner() - .map_err(Into::into) - .map_err(DocumentFormatError::Internal)?; - - Ok(count as usize) -} diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs index faf63497c..10bf90974 100644 --- a/index-scheduler/src/error.rs +++ b/index-scheduler/src/error.rs @@ -1,8 +1,6 @@ use milli::heed; use thiserror::Error; -use crate::index; - #[derive(Error, Debug)] pub enum Error { #[error("Index `{0}` not found")] diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index d6c4d27b6..3cece80f2 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1,8 +1,6 @@ mod autobatcher; mod batch; -mod document_formats; pub mod error; -pub mod index; pub mod task; mod utils; diff --git a/index/Cargo.toml b/index/Cargo.toml new file mode 100644 index 000000000..008d25c28 --- /dev/null +++ b/index/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "index" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.64" +bincode = "1.3.3" +csv = "1.1.6" +derivative = "2.2.0" +either = { version = "1.6.1", features = ["serde"] } +fst = "0.4.7" +indexmap = { version = "1.8.0", features = ["serde-1"] } +lazy_static = "1.4.0" +log = "0.4.14" +meilisearch-types = { path = "../meilisearch-types" } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" } +obkv = "0.2.0" +permissive-json-pointer = { path = "../permissive-json-pointer" } +regex = "1.5.5" +serde = { version = "1.0.136", features = ["derive"] } +serde_json = { version = "1.0.85", features = ["preserve_order"] } +thiserror = "1.0.30" +time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } +file-store = { path = "../file-store" } +uuid = { version = "1.1.2", features = ["serde", "v4"] } + +[dev-dependencies] +nelson = { git = "https://github.com/meilisearch/nelson.git", rev = "675f13885548fb415ead8fbb447e9e6d9314000a"} +proptest = "1.0.0" +proptest-derive = "0.3.0" diff --git a/index-scheduler/src/index/dump.rs b/index/src/dump.rs similarity index 100% rename from index-scheduler/src/index/dump.rs rename to index/src/dump.rs diff --git a/index-scheduler/src/index/error.rs b/index/src/error.rs similarity index 100% rename from index-scheduler/src/index/error.rs rename to index/src/error.rs diff --git a/index-scheduler/src/index/index.rs b/index/src/index.rs similarity index 99% rename from index-scheduler/src/index/index.rs rename to index/src/index.rs index 7ee4b712b..1b3494a18 100644 --- a/index-scheduler/src/index/index.rs +++ b/index/src/index.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; use time::OffsetDateTime; -use crate::index::search::DEFAULT_PAGINATION_MAX_TOTAL_HITS; +use crate::search::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use super::error::IndexError; use super::error::Result; diff --git a/index-scheduler/src/index/mod.rs b/index/src/lib.rs similarity index 97% rename from index-scheduler/src/index/mod.rs rename to index/src/lib.rs index cd9ed1b69..9a5d01a54 100644 --- a/index-scheduler/src/index/mod.rs +++ b/index/src/lib.rs @@ -12,10 +12,10 @@ pub mod updates; #[allow(clippy::module_inception)] mod index; -pub use index::{Document, IndexMeta, IndexStats}; +pub use self::index::{Document, IndexMeta, IndexStats}; #[cfg(not(test))] -pub use index::Index; +pub use self::index::Index; #[cfg(test)] pub use test::MockIndex as Index; @@ -37,7 +37,7 @@ pub mod test { use super::index::Index; use super::Document; use super::{Checked, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings}; - use crate::update_file_store::UpdateFileStore; + use file_store::FileStore; #[derive(Clone)] pub enum MockIndex { @@ -164,7 +164,7 @@ pub mod test { &self, method: IndexDocumentsMethod, primary_key: Option, - file_store: UpdateFileStore, + file_store: FileStore, contents: impl Iterator, ) -> Result>> { match self { diff --git a/index/src/main.rs b/index/src/main.rs new file mode 100644 index 000000000..e7a11a969 --- /dev/null +++ b/index/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} diff --git a/index-scheduler/src/index/search.rs b/index/src/search.rs similarity index 99% rename from index-scheduler/src/index/search.rs rename to index/src/search.rs index 57171d529..e53bb6476 100644 --- a/index-scheduler/src/index/search.rs +++ b/index/src/search.rs @@ -13,7 +13,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; -use crate::index::error::FacetError; +use crate::error::FacetError; use super::error::{IndexError, Result}; use super::index::Index; diff --git a/index-scheduler/src/index/updates.rs b/index/src/updates.rs similarity index 100% rename from index-scheduler/src/index/updates.rs rename to index/src/updates.rs