From 77405cc10375cecf18d367cabdf8f3039f6c6ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Mar 2019 16:58:55 +0100 Subject: [PATCH] chore: Remove the database module from meilidb --- meilidb/Cargo.toml | 32 +- meilidb/src/database/config.rs | 46 - meilidb/src/database/document_key.rs | 149 --- meilidb/src/database/mod.rs | 899 ------------------ meilidb/src/database/number.rs | 98 -- meilidb/src/database/schema.rs | 319 ------- meilidb/src/database/serde/deserializer.rs | 186 ---- meilidb/src/database/serde/find_id.rs | 243 ----- .../src/database/serde/indexer_serializer.rs | 190 ---- meilidb/src/database/serde/key_to_string.rs | 146 --- meilidb/src/database/serde/mod.rs | 65 -- meilidb/src/database/serde/serializer.rs | 282 ------ meilidb/src/database/serde/value_to_number.rs | 176 ---- meilidb/src/database/update/index_event.rs | 55 -- meilidb/src/database/update/mod.rs | 234 ----- .../src/database/update/ranked_map_event.rs | 58 -- meilidb/src/database/view.rs | 199 ---- meilidb/src/lib.rs | 3 - 18 files changed, 3 insertions(+), 3377 deletions(-) delete mode 100644 meilidb/src/database/config.rs delete mode 100644 meilidb/src/database/document_key.rs delete mode 100644 meilidb/src/database/mod.rs delete mode 100644 meilidb/src/database/number.rs delete mode 100644 meilidb/src/database/schema.rs delete mode 100644 meilidb/src/database/serde/deserializer.rs delete mode 100644 meilidb/src/database/serde/find_id.rs delete mode 100644 meilidb/src/database/serde/indexer_serializer.rs delete mode 100644 meilidb/src/database/serde/key_to_string.rs delete mode 100644 meilidb/src/database/serde/mod.rs delete mode 100644 meilidb/src/database/serde/serializer.rs delete mode 100644 meilidb/src/database/serde/value_to_number.rs delete mode 100644 meilidb/src/database/update/index_event.rs delete mode 100644 meilidb/src/database/update/mod.rs delete mode 100644 meilidb/src/database/update/ranked_map_event.rs delete mode 100644 meilidb/src/database/view.rs diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 9f12a3a2f..8dc6f0db5 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -5,39 +5,13 @@ version = "0.3.1" authors = ["Kerollmops "] [dependencies] -arc-swap = "0.3.7" -bincode = "1.1.2" -byteorder = "1.3.1" -fst = "0.3.3" -hashbrown = { version = "0.1.8", features = ["serde"] } -linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } -lockfree = "0.5.1" -log = "0.4.6" meilidb-core = { path = "../meilidb-core", version = "0.1.0" } meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } -sdset = "0.3.1" -serde = "1.0.88" -serde_derive = "1.0.88" -serde_json = { version = "1.0.38", features = ["preserve_order"] } -size_format = "1.0.2" -slice-group-by = "0.2.4" -toml = { version = "0.5.0", features = ["preserve_order"] } -unidecode = "0.3.0" - -[dependencies.rocksdb] -git = "https://github.com/pingcap/rust-rocksdb.git" -rev = "306e201" [features] -default = ["simd"] -i128 = ["bincode/i128", "meilidb-core/i128"] -portable = ["rocksdb/portable"] -simd = ["rocksdb/sse"] -nightly = [ - "hashbrown/nightly", - "slice-group-by/nightly", - "meilidb-core/nightly" -] +default = [] +i128 = ["meilidb-core/i128"] +nightly = ["meilidb-core/nightly"] [dev-dependencies] csv = "1.0.5" diff --git a/meilidb/src/database/config.rs b/meilidb/src/database/config.rs deleted file mode 100644 index 491cdba93..000000000 --- a/meilidb/src/database/config.rs +++ /dev/null @@ -1,46 +0,0 @@ -use std::collections::{HashSet, HashMap}; -use serde_derive::{Serialize, Deserialize}; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum RankingOrdering { - Asc, - Dsc -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct AccessToken { - pub read_key: String, - pub write_key: String, - pub admin_key: String, -} - - -#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct Config { - pub stop_words: Option>, - pub ranking_order: Option>, - pub distinct_field: Option, - pub ranking_rules: Option>, - pub access_token: Option, -} - -impl Config { - pub fn update_with(&mut self, new: Config) { - if let Some(stop_words) = new.stop_words { - self.stop_words = Some(stop_words); - }; - if let Some(ranking_order) = new.ranking_order { - self.ranking_order = Some(ranking_order); - }; - if let Some(distinct_field) = new.distinct_field { - self.distinct_field = Some(distinct_field); - }; - if let Some(ranking_rules) = new.ranking_rules { - self.ranking_rules = Some(ranking_rules); - }; - if let Some(access_token) = new.access_token { - self.access_token = Some(access_token); - }; - } -} diff --git a/meilidb/src/database/document_key.rs b/meilidb/src/database/document_key.rs deleted file mode 100644 index d6b9865ef..000000000 --- a/meilidb/src/database/document_key.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::io::{Cursor, Read, Write}; -use std::mem::size_of; -use std::fmt; - -use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt}; - -use crate::database::schema::SchemaAttr; -use meilidb_core::DocumentId; - -const DOC_KEY_LEN: usize = 4 + size_of::(); -const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::(); - -#[derive(Copy, Clone)] -pub struct DocumentKey([u8; DOC_KEY_LEN]); - -impl DocumentKey { - pub fn new(id: DocumentId) -> DocumentKey { - let mut buffer = [0; DOC_KEY_LEN]; - - let mut wtr = Cursor::new(&mut buffer[..]); - wtr.write_all(b"doc-").unwrap(); - wtr.write_u64::(id.0).unwrap(); - - DocumentKey(buffer) - } - - pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey { - assert!(bytes.len() >= DOC_KEY_LEN); - assert_eq!(&bytes[..4], b"doc-"); - - let mut buffer = [0; DOC_KEY_LEN]; - bytes.read_exact(&mut buffer).unwrap(); - - DocumentKey(buffer) - } - - pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), attr) - } - - pub fn with_attribute_min(&self) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), SchemaAttr::min()) - } - - pub fn with_attribute_max(&self) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), SchemaAttr::max()) - } - - pub fn document_id(&self) -> DocumentId { - let id = (&self.0[4..]).read_u64::().unwrap(); - DocumentId(id) - } -} - -impl AsRef<[u8]> for DocumentKey { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl fmt::Debug for DocumentKey { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("DocumentKey") - .field("document_id", &self.document_id()) - .finish() - } -} - -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]); - -impl DocumentKeyAttr { - pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr { - let mut buffer = [0; DOC_KEY_ATTR_LEN]; - let DocumentKey(raw_key) = DocumentKey::new(id); - - let mut wtr = Cursor::new(&mut buffer[..]); - wtr.write_all(&raw_key).unwrap(); - wtr.write_all(b"-").unwrap(); - wtr.write_u16::(attr.0).unwrap(); - - DocumentKeyAttr(buffer) - } - - pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr { - DocumentKeyAttr::new(id, SchemaAttr::min()) - } - - pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr { - DocumentKeyAttr::new(id, SchemaAttr::max()) - } - - pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr { - assert!(bytes.len() >= DOC_KEY_ATTR_LEN); - assert_eq!(&bytes[..4], b"doc-"); - - let mut buffer = [0; DOC_KEY_ATTR_LEN]; - bytes.read_exact(&mut buffer).unwrap(); - - DocumentKeyAttr(buffer) - } - - pub fn document_id(&self) -> DocumentId { - let id = (&self.0[4..]).read_u64::().unwrap(); - DocumentId(id) - } - - pub fn attribute(&self) -> SchemaAttr { - let offset = 4 + size_of::() + 1; - let value = (&self.0[offset..]).read_u16::().unwrap(); - SchemaAttr::new(value) - } - - pub fn into_document_key(self) -> DocumentKey { - DocumentKey::new(self.document_id()) - } -} - -impl AsRef<[u8]> for DocumentKeyAttr { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl fmt::Debug for DocumentKeyAttr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("DocumentKeyAttr") - .field("document_id", &self.document_id()) - .field("attribute", &self.attribute().0) - .finish() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn keep_as_ref_order() { - for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) { - let id = DocumentId(0); - let a = DocumentKeyAttr::new(id, SchemaAttr(a)); - let b = DocumentKeyAttr::new(id, SchemaAttr(b)); - - assert!(a < b); - assert!(a.as_ref() < b.as_ref()); - } - } -} diff --git a/meilidb/src/database/mod.rs b/meilidb/src/database/mod.rs deleted file mode 100644 index 16ba148b7..000000000 --- a/meilidb/src/database/mod.rs +++ /dev/null @@ -1,899 +0,0 @@ -use std::time::Instant; -use std::error::Error; -use std::ffi::OsStr; -use std::sync::Arc; -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::ops::{Deref, DerefMut}; - -use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions}; -use rocksdb::rocksdb::{Writable, Snapshot}; -use rocksdb::{DB, MergeOperands}; -use size_format::SizeFormatterBinary; -use arc_swap::ArcSwap; -use lockfree::map::Map; -use hashbrown::HashMap; -use log::{info, error, warn}; - -use crate::database::schema::SchemaAttr; -use meilidb_core::shared_data_cursor::FromSharedDataCursor; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::{Index, DocumentId}; - -use self::update::{ReadIndexEvent, ReadRankedMapEvent}; - -pub use self::config::Config; -pub use self::document_key::{DocumentKey, DocumentKeyAttr}; -pub use self::view::{DatabaseView, DocumentIter}; -pub use self::update::Update; -pub use self::serde::SerializerError; -pub use self::schema::Schema; -pub use self::number::{Number, ParseNumberError}; - -pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; - -const DATA_INDEX: &[u8] = b"data-index"; -const DATA_RANKED_MAP: &[u8] = b"data-ranked-map"; -const DATA_SCHEMA: &[u8] = b"data-schema"; -const CONFIG: &[u8] = b"config"; - -pub mod config; -pub mod schema; -mod number; -mod document_key; -mod serde; -mod update; -mod view; - -fn retrieve_data_schema(snapshot: &Snapshot) -> Result> -where D: Deref -{ - match snapshot.get(DATA_SCHEMA)? { - Some(vector) => Ok(Schema::read_from_bin(&*vector)?), - None => Err(String::from("BUG: no schema found in the database").into()), - } -} - -fn retrieve_data_index(snapshot: &Snapshot) -> Result> -where D: Deref -{ - let start = Instant::now(); - let vector = snapshot.get(DATA_INDEX)?; - info!("loading index from kv-store took {:.2?}", start.elapsed()); - - match vector { - Some(vector) => { - let start = Instant::now(); - - let bytes = vector.as_ref().to_vec(); - info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64)); - - let event = ReadIndexEvent::from_bytes(bytes)?; - let index = event.updated_documents().expect("BUG: invalid event deserialized"); - - info!("loading index from bytes took {:.2?}", start.elapsed()); - - Ok(index) - }, - None => Ok(Index::default()), - } -} - -fn retrieve_data_ranked_map(snapshot: &Snapshot) -> Result> -where D: Deref, -{ - let start = Instant::now(); - let vector = snapshot.get(DATA_RANKED_MAP)?; - info!("loading ranked map from kv-store took {:.2?}", start.elapsed()); - - match vector { - Some(vector) => { - let start = Instant::now(); - - let bytes = vector.as_ref().to_vec(); - info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64)); - - let event = ReadRankedMapEvent::from_bytes(bytes)?; - let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized"); - - info!("loading ranked map from bytes took {:.2?}", start.elapsed()); - - Ok(ranked_map) - }, - None => Ok(RankedMap::new()), - } -} - -fn retrieve_config(snapshot: &Snapshot) -> Result> -where D: Deref, -{ - match snapshot.get(CONFIG)? { - Some(vector) => Ok(bincode::deserialize(&*vector)?), - None => Ok(Config::default()), - } -} - -fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - use self::update::ReadIndexEvent::{self, *}; - use self::update::WriteIndexEvent; - - let mut index = Index::default(); - for bytes in existing.into_iter().chain(operands) { - match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() { - RemovedDocuments(d) => index = index.remove_documents(d.as_ref()), - UpdatedDocuments(i) => index = index.union(&i), - } - } - - WriteIndexEvent::UpdatedDocuments(&index).into_bytes() -} - -fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - use self::update::ReadRankedMapEvent::{self, *}; - use self::update::WriteRankedMapEvent; - - let mut ranked_map = RankedMap::default(); - for bytes in existing.into_iter().chain(operands) { - match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() { - RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()), - UpdatedDocuments(i) => ranked_map.extend(i), - } - } - - WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes() -} - -fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - match key { - DATA_INDEX => merge_indexes(existing, operands), - DATA_RANKED_MAP => merge_ranked_maps(existing, operands), - key => panic!("The merge operator does not support merging {:?}", key), - } -} - -pub struct IndexUpdate { - index: String, - update: Update, -} - -impl Deref for IndexUpdate { - type Target = Update; - - fn deref(&self) -> &Update { - &self.update - } -} - -impl DerefMut for IndexUpdate { - fn deref_mut(&mut self) -> &mut Update { - &mut self.update - } -} - -struct DatabaseIndex { - db: Arc, - - // This view is updated each time the DB ingests an update. - view: ArcSwap>>, - - // The path of the mdb folder stored on disk. - path: PathBuf, - - // must_die false by default, must be set as true when the Index is dropped. - // It is used to erase the folder saved on disk when the user request to delete an index. - must_die: AtomicBool, -} - -impl DatabaseIndex { - fn create>(path: P, schema: &Schema) -> Result> { - let path = path.as_ref(); - if path.exists() { - return Err(format!("File already exists at path: {}, cannot create database.", - path.display()).into()) - } - - let path_lossy = path.to_string_lossy(); - let mut opts = DBOptions::new(); - opts.create_if_missing(true); - // opts.error_if_exists(true); // FIXME pull request that - - let mut cf_opts = ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data merge operator", merge_operator); - - let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; - - let mut schema_bytes = Vec::new(); - schema.write_to_bin(&mut schema_bytes)?; - db.put(DATA_SCHEMA, &schema_bytes)?; - - let db = Arc::new(db); - let snapshot = Snapshot::new(db.clone()); - let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?)); - - Ok(DatabaseIndex { - db: db, - view: view, - path: path.to_path_buf(), - must_die: AtomicBool::new(false) - }) - } - - fn open>(path: P) -> Result> { - let path_lossy = path.as_ref().to_string_lossy(); - - let mut opts = DBOptions::new(); - opts.create_if_missing(false); - - let mut cf_opts = ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data merge operator", merge_operator); - - let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; - - // FIXME create a generic function to do that ! - let _schema = match db.get(DATA_SCHEMA)? { - Some(value) => Schema::read_from_bin(&*value)?, - None => return Err(String::from("Database does not contain a schema").into()), - }; - - let db = Arc::new(db); - let snapshot = Snapshot::new(db.clone()); - let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?)); - - Ok(DatabaseIndex { - db: db, - view: view, - path: path.as_ref().to_path_buf(), - must_die: AtomicBool::new(false) - }) - } - - fn must_die(&self) { - self.must_die.store(true, Ordering::Relaxed) - } - - fn start_update(&self) -> Result> { - let schema = match self.db.get(DATA_SCHEMA)? { - Some(value) => Schema::read_from_bin(&*value)?, - None => panic!("Database does not contain a schema"), - }; - - Ok(Update::new(schema)) - } - - fn commit_update(&self, update: Update) -> Result>>, Box> { - let batch = update.build()?; - self.db.write(batch)?; - self.db.compact_range(None, None); - self.db.flush(true)?; - - let snapshot = Snapshot::new(self.db.clone()); - let view = Arc::new(DatabaseView::new(snapshot)?); - self.view.store(view.clone()); - - Ok(view) - } - - fn view(&self) -> Arc>> { - self.view.load() - } - - fn get_config(&self) -> Config { - self.view().config().clone() - } - - fn update_config(&self, config: Config) -> Result>>, Box>{ - let data = bincode::serialize(&config)?; - self.db.put(CONFIG, &data)?; - - let snapshot = Snapshot::new(self.db.clone()); - let view = Arc::new(DatabaseView::new(snapshot)?); - self.view.store(view.clone()); - - Ok(view) - } - - fn path(&self) -> &Path { - self.path.as_path() - } -} - -impl Drop for DatabaseIndex { - fn drop(&mut self) { - if self.must_die.load(Ordering::Relaxed) { - if let Err(err) = fs::remove_dir_all(&self.path) { - error!("Impossible to remove mdb when Database is dropped; {}", err); - } - } - } -} - -pub struct Database { - indexes: Map>, - path: PathBuf, -} - -impl Database { - pub fn create>(path: P) -> Result> { - Ok(Database { - indexes: Map::new(), - path: path.as_ref().to_path_buf(), - }) - } - - pub fn open>(path: P) -> Result> { - let entries = fs::read_dir(&path)?; - - let indexes = Map::new(); - for entry in entries { - let path = match entry { - Ok(p) => p.path(), - Err(err) => { - warn!("Impossible to retrieve the path from an entry; {}", err); - continue - } - }; - - let name = match path.file_stem().and_then(OsStr::to_str) { - Some(name) => name.to_owned(), - None => continue - }; - - let db = match DatabaseIndex::open(path.clone()) { - Ok(db) => db, - Err(err) => { - warn!("Impossible to open the database; {}", err); - continue - } - }; - - info!("Load database {}", name); - indexes.insert(name, Arc::new(db)); - } - - Ok(Database { - indexes: indexes, - path: path.as_ref().to_path_buf(), - }) - } - - pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box> { - let index_path = self.path.join(name); - - if index_path.exists() { - return Err("Index already exists".into()); - } - - let index = DatabaseIndex::create(index_path, schema)?; - self.indexes.insert(name.to_owned(), Arc::new(index)); - - Ok(()) - } - - pub fn delete_index(&self, name: &str) -> Result<(), Box> { - let index_guard = self.indexes.remove(name).ok_or("Index not found")?; - index_guard.val().must_die(); - - Ok(()) - } - - pub fn list_indexes(&self) -> Vec { - self.indexes.iter().map(|g| g.key().clone()).collect() - } - - pub fn start_update(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - let update = index_guard.val().start_update()?; - - Ok(IndexUpdate { index: index.to_owned(), update }) - } - - pub fn commit_update(&self, update: IndexUpdate)-> Result>>, Box> { - let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?; - - index_guard.val().commit_update(update.update) - } - - pub fn view(&self, index: &str) -> Result>>, Box> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().view()) - } - - pub fn get_config(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().get_config()) - } - - pub fn update_config(&self, index: &str, config: Config) -> Result>>, Box>{ - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().update_config(config)?) - } - - pub fn path(&self) -> &Path { - self.path.as_path() - } - - pub fn index_path(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - let path = index_guard.val().path(); - Ok(path.to_path_buf()) - } - -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - use std::error::Error; - - use serde_derive::{Serialize, Deserialize}; - - use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; - - use super::*; - - #[test] - fn ingest_one_easy_update() -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let meilidb_path = dir.path().join("meilidb.mdb"); - let meilidb_index_name = "default"; - - #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] - struct SimpleDoc { - id: u64, - title: String, - description: String, - timestamp: u64, - } - - let schema = { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("id", STORED); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - builder.new_attribute("timestamp", STORED); - builder.build() - }; - - let database = Database::create(&meilidb_path)?; - - database.create_index(meilidb_index_name, &schema)?; - - let doc0 = SimpleDoc { - id: 0, - title: String::from("I am a title"), - description: String::from("I am a description"), - timestamp: 1234567, - }; - let doc1 = SimpleDoc { - id: 1, - title: String::from("I am the second title"), - description: String::from("I am the second description"), - timestamp: 7654321, - }; - - let mut builder = database.start_update(meilidb_index_name)?; - - let docid0 = builder.update_document(&doc0, &stop_words)?; - let docid1 = builder.update_document(&doc1, &stop_words)?; - - let view = database.commit_update(builder)?; - - let de_doc0: SimpleDoc = view.document_by_id(docid0)?; - let de_doc1: SimpleDoc = view.document_by_id(docid1)?; - - assert_eq!(doc0, de_doc0); - assert_eq!(doc1, de_doc1); - - Ok(dir.close()?) - } - - #[test] - fn ingest_two_easy_updates() -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let meilidb_path = dir.path().join("meilidb.mdb"); - let meilidb_index_name = "default"; - - #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] - struct SimpleDoc { - id: u64, - title: String, - description: String, - timestamp: u64, - } - - let schema = { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("id", STORED); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - builder.new_attribute("timestamp", STORED); - builder.build() - }; - - let database = Database::create(&meilidb_path)?; - - database.create_index(meilidb_index_name, &schema)?; - - let doc0 = SimpleDoc { - id: 0, - title: String::from("I am a title"), - description: String::from("I am a description"), - timestamp: 1234567, - }; - let doc1 = SimpleDoc { - id: 1, - title: String::from("I am the second title"), - description: String::from("I am the second description"), - timestamp: 7654321, - }; - let doc2 = SimpleDoc { - id: 2, - title: String::from("I am the third title"), - description: String::from("I am the third description"), - timestamp: 7654321, - }; - let doc3 = SimpleDoc { - id: 3, - title: String::from("I am the fourth title"), - description: String::from("I am the fourth description"), - timestamp: 7654321, - }; - - let mut builder = database.start_update(meilidb_index_name)?; - let docid0 = builder.update_document(&doc0, &stop_words)?; - let docid1 = builder.update_document(&doc1, &stop_words)?; - database.commit_update(builder)?; - - let mut builder = database.start_update(meilidb_index_name)?; - let docid2 = builder.update_document(&doc2, &stop_words)?; - let docid3 = builder.update_document(&doc3, &stop_words)?; - let view = database.commit_update(builder)?; - - let de_doc0: SimpleDoc = view.document_by_id(docid0)?; - let de_doc1: SimpleDoc = view.document_by_id(docid1)?; - - assert_eq!(doc0, de_doc0); - assert_eq!(doc1, de_doc1); - - let de_doc2: SimpleDoc = view.document_by_id(docid2)?; - let de_doc3: SimpleDoc = view.document_by_id(docid3)?; - - assert_eq!(doc2, de_doc2); - assert_eq!(doc3, de_doc3); - - Ok(dir.close()?) - } -} - -#[cfg(all(feature = "nightly", test))] -mod bench { - extern crate test; - - use std::collections::HashSet; - use std::error::Error; - use std::iter::repeat_with; - use self::test::Bencher; - - use rand::distributions::Alphanumeric; - use rand_xorshift::XorShiftRng; - use rand::{Rng, SeedableRng}; - use serde_derive::Serialize; - use rand::seq::SliceRandom; - - use crate::tokenizer::DefaultBuilder; - use crate::database::schema::*; - - use super::*; - - fn random_sentences(number: usize, rng: &mut R) -> String { - let mut words = String::new(); - - for i in 0..number { - let word_len = rng.gen_range(1, 12); - let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len); - words.extend(iter); - - if i == number - 1 { // last word - let final_ = [".", "?", "!", "..."].choose(rng).cloned(); - words.extend(final_); - } else { - let middle = [",", ", "].choose(rng).cloned(); - words.extend(middle); - } - } - - words - } - - #[bench] - fn open_little_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..300 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - fn open_medium_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..3000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - #[ignore] - fn open_big_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..30_000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..300 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } - - #[bench] - fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..3000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } - - #[bench] - #[ignore] - fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..30_000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } -} diff --git a/meilidb/src/database/number.rs b/meilidb/src/database/number.rs deleted file mode 100644 index b2c4c9a88..000000000 --- a/meilidb/src/database/number.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::cmp::Ordering; -use std::str::FromStr; -use std::fmt; - -use serde_derive::{Serialize, Deserialize}; - -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone)] -pub enum Number { - Unsigned(u64), - Signed(i64), - Float(f64), -} - -impl FromStr for Number { - type Err = ParseNumberError; - - fn from_str(s: &str) -> Result { - if let Ok(unsigned) = u64::from_str(s) { - return Ok(Number::Unsigned(unsigned)) - } - - if let Ok(signed) = i64::from_str(s) { - return Ok(Number::Signed(signed)) - } - - if let Ok(float) = f64::from_str(s) { - if float == 0.0 || float.is_normal() { - return Ok(Number::Float(float)) - } - } - - Err(ParseNumberError) - } -} - -impl PartialOrd for Number { - fn partial_cmp(&self, other: &Number) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Number { - fn cmp(&self, other: &Number) -> Ordering { - use Number::*; - match (self, other) { - (Unsigned(s), Unsigned(o)) => s.cmp(o), - (Unsigned(s), Signed(o)) => { - let s = i128::from(*s); - let o = i128::from(*o); - s.cmp(&o) - }, - (Unsigned(s), Float(o)) => { - let s = *s as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - - (Signed(s), Unsigned(o)) => { - let s = i128::from(*s); - let o = i128::from(*o); - s.cmp(&o) - }, - (Signed(s), Signed(o)) => s.cmp(o), - (Signed(s), Float(o)) => { - let s = *s as f64; - s.partial_cmp(o).unwrap_or(Ordering::Equal) - }, - - (Float(s), Unsigned(o)) => { - let o = *o as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - (Float(s), Signed(o)) => { - let o = *o as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - (Float(s), Float(o)) => { - s.partial_cmp(o).unwrap_or(Ordering::Equal) - }, - } - } -} - -impl PartialEq for Number { - fn eq(&self, other: &Number) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for Number { } - -pub struct ParseNumberError; - -impl fmt::Display for ParseNumberError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("can not parse number") - } -} diff --git a/meilidb/src/database/schema.rs b/meilidb/src/database/schema.rs deleted file mode 100644 index b4e0a070c..000000000 --- a/meilidb/src/database/schema.rs +++ /dev/null @@ -1,319 +0,0 @@ -use std::collections::{HashMap, BTreeMap}; -use std::io::{Read, Write}; -use std::error::Error; -use std::{fmt, u16}; -use std::ops::BitOr; -use std::sync::Arc; - -use serde_derive::{Serialize, Deserialize}; -use linked_hash_map::LinkedHashMap; - -use crate::database::serde::find_id::FindDocumentIdSerializer; -use crate::database::serde::SerializerError; -use meilidb_core::DocumentId; - -pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false }; -pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false }; -pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true }; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct SchemaProps { - #[serde(default)] - stored: bool, - - #[serde(default)] - indexed: bool, - - #[serde(default)] - ranked: bool, -} - -impl SchemaProps { - pub fn is_stored(self) -> bool { - self.stored - } - - pub fn is_indexed(self) -> bool { - self.indexed - } - - pub fn is_ranked(self) -> bool { - self.ranked - } -} - -impl BitOr for SchemaProps { - type Output = Self; - - fn bitor(self, other: Self) -> Self::Output { - SchemaProps { - stored: self.stored | other.stored, - indexed: self.indexed | other.indexed, - ranked: self.ranked | other.ranked, - } - } -} - -#[derive(Serialize, Deserialize)] -pub struct SchemaBuilder { - identifier: String, - attributes: LinkedHashMap, -} - -impl SchemaBuilder { - pub fn with_identifier>(name: S) -> SchemaBuilder { - SchemaBuilder { - identifier: name.into(), - attributes: LinkedHashMap::new(), - } - } - - pub fn new_attribute>(&mut self, name: S, props: SchemaProps) -> SchemaAttr { - let len = self.attributes.len(); - if self.attributes.insert(name.into(), props).is_some() { - panic!("Field already inserted.") - } - SchemaAttr(len as u16) - } - - pub fn build(self) -> Schema { - let mut attrs = HashMap::new(); - let mut props = Vec::new(); - - for (i, (name, prop)) in self.attributes.into_iter().enumerate() { - attrs.insert(name.clone(), SchemaAttr(i as u16)); - props.push((name, prop)); - } - - let identifier = self.identifier; - Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Schema { - inner: Arc, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct InnerSchema { - identifier: String, - attrs: HashMap, - props: Vec<(String, SchemaProps)>, -} - -impl Schema { - pub fn from_toml(mut reader: R) -> Result> { - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - let builder: SchemaBuilder = toml::from_slice(&buffer)?; - Ok(builder.build()) - } - - pub fn to_toml(&self, mut writer: W) -> Result<(), Box> { - let identifier = self.inner.identifier.clone(); - let attributes = self.attributes_ordered(); - let builder = SchemaBuilder { identifier, attributes }; - - let string = toml::to_string_pretty(&builder)?; - writer.write_all(string.as_bytes())?; - - Ok(()) - } - - pub fn from_json(mut reader: R) -> Result> { - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - let builder: SchemaBuilder = serde_json::from_slice(&buffer)?; - Ok(builder.build()) - } - - pub fn to_json(&self, mut writer: W) -> Result<(), Box> { - let identifier = self.inner.identifier.clone(); - let attributes = self.attributes_ordered(); - let builder = SchemaBuilder { identifier, attributes }; - let string = serde_json::to_string_pretty(&builder)?; - writer.write_all(string.as_bytes())?; - - Ok(()) - } - - pub(crate) fn read_from_bin(reader: R) -> bincode::Result { - let builder: SchemaBuilder = bincode::deserialize_from(reader)?; - Ok(builder.build()) - } - - pub(crate) fn write_to_bin(&self, writer: W) -> bincode::Result<()> { - let identifier = self.inner.identifier.clone(); - let attributes = self.attributes_ordered(); - let builder = SchemaBuilder { identifier, attributes }; - - bincode::serialize_into(writer, &builder) - } - - fn attributes_ordered(&self) -> LinkedHashMap { - let mut ordered = BTreeMap::new(); - for (name, attr) in &self.inner.attrs { - let (_, props) = self.inner.props[attr.0 as usize]; - ordered.insert(attr.0, (name, props)); - } - - let mut attributes = LinkedHashMap::with_capacity(ordered.len()); - for (_, (name, props)) in ordered { - attributes.insert(name.clone(), props); - } - - attributes - } - - pub fn document_id(&self, document: T) -> Result - where T: serde::Serialize, - { - let id_attribute_name = &self.inner.identifier; - let serializer = FindDocumentIdSerializer { id_attribute_name }; - document.serialize(serializer) - } - - pub fn props(&self, attr: SchemaAttr) -> SchemaProps { - let (_, props) = self.inner.props[attr.0 as usize]; - props - } - - pub fn identifier_name(&self) -> &str { - &self.inner.identifier - } - - pub fn attribute>(&self, name: S) -> Option { - self.inner.attrs.get(name.as_ref()).cloned() - } - - pub fn attribute_name(&self, attr: SchemaAttr) -> &str { - let (name, _) = &self.inner.props[attr.0 as usize]; - name - } -} - -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -pub struct SchemaAttr(pub u16); - -impl SchemaAttr { - pub fn new(value: u16) -> SchemaAttr { - SchemaAttr(value) - } - - pub fn min() -> SchemaAttr { - SchemaAttr(0) - } - - pub fn next(self) -> Option { - self.0.checked_add(1).map(SchemaAttr) - } - - pub fn prev(self) -> Option { - self.0.checked_sub(1).map(SchemaAttr) - } - - pub fn max() -> SchemaAttr { - SchemaAttr(u16::MAX) - } -} - -impl fmt::Display for SchemaAttr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.0.fmt(f) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::error::Error; - - #[test] - fn serialize_deserialize() -> bincode::Result<()> { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", STORED); - builder.new_attribute("beta", STORED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let mut buffer = Vec::new(); - - schema.write_to_bin(&mut buffer)?; - let schema2 = Schema::read_from_bin(buffer.as_slice())?; - - assert_eq!(schema, schema2); - - Ok(()) - } - - #[test] - fn serialize_deserialize_toml() -> Result<(), Box> { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", STORED); - builder.new_attribute("beta", STORED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let mut buffer = Vec::new(); - schema.to_toml(&mut buffer)?; - - let schema2 = Schema::from_toml(buffer.as_slice())?; - assert_eq!(schema, schema2); - - let data = r#" - identifier = "id" - - [attributes."alpha"] - stored = true - - [attributes."beta"] - stored = true - indexed = true - - [attributes."gamma"] - indexed = true - "#; - let schema2 = Schema::from_toml(data.as_bytes())?; - assert_eq!(schema, schema2); - - Ok(()) - } - - #[test] - fn serialize_deserialize_json() -> Result<(), Box> { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", STORED); - builder.new_attribute("beta", STORED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let mut buffer = Vec::new(); - schema.to_json(&mut buffer)?; - - let schema2 = Schema::from_json(buffer.as_slice())?; - assert_eq!(schema, schema2); - - let data = r#" - { - "identifier": "id", - "attributes": { - "alpha": { - "stored": true - }, - "beta": { - "stored": true, - "indexed": true - }, - "gamma": { - "indexed": true - } - } - }"#; - let schema2 = Schema::from_json(data.as_bytes())?; - assert_eq!(schema, schema2); - - Ok(()) - } -} diff --git a/meilidb/src/database/serde/deserializer.rs b/meilidb/src/database/serde/deserializer.rs deleted file mode 100644 index 92374ab48..000000000 --- a/meilidb/src/database/serde/deserializer.rs +++ /dev/null @@ -1,186 +0,0 @@ -use std::error::Error; -use std::ops::Deref; -use std::fmt; - -use rocksdb::rocksdb::{DB, Snapshot, SeekKey}; -use rocksdb::rocksdb_options::ReadOptions; -use serde::forward_to_deserialize_any; -use serde::de::value::MapDeserializer; -use serde::de::{self, Visitor, IntoDeserializer}; - -use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::database::schema::Schema; -use meilidb_core::DocumentId; - -pub struct Deserializer<'a, D> -where D: Deref -{ - snapshot: &'a Snapshot, - schema: &'a Schema, - document_id: DocumentId, -} - -impl<'a, D> Deserializer<'a, D> -where D: Deref -{ - pub fn new(snapshot: &'a Snapshot, schema: &'a Schema, doc: DocumentId) -> Self { - Deserializer { snapshot, schema, document_id: doc } - } -} - -impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D> -where D: Deref -{ - type Error = DeserializerError; - - fn deserialize_any(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_map(visitor) - } - - forward_to_deserialize_any! { - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq - bytes byte_buf unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } - - fn deserialize_map(self, visitor: V) -> Result - where V: Visitor<'de> - { - let mut options = ReadOptions::new(); - let lower = DocumentKey::new(self.document_id); - let upper = lower.with_attribute_max(); - options.set_iterate_lower_bound(lower.as_ref()); - options.set_iterate_upper_bound(upper.as_ref()); - - let mut iter = self.snapshot.iter_opt(options); - iter.seek(SeekKey::Start); - - if iter.kv().is_none() { - // FIXME return an error - } - - let iter = iter.map(|(key, value)| { - // retrieve the schema attribute name - // from the schema attribute number - let document_key_attr = DocumentKeyAttr::from_bytes(&key); - let schema_attr = document_key_attr.attribute(); - let attribute_name = self.schema.attribute_name(schema_attr); - (attribute_name, Value(value)) - }); - - let map_deserializer = MapDeserializer::new(iter); - visitor.visit_map(map_deserializer) - } -} - -struct Value(Vec); - -impl<'de> IntoDeserializer<'de, DeserializerError> for Value { - type Deserializer = Self; - - fn into_deserializer(self) -> Self::Deserializer { - self - } -} - -macro_rules! forward_to_bincode_values { - ($($ty:ident => $de_method:ident,)*) => { - $( - fn $de_method(self, visitor: V) -> Result - where V: de::Visitor<'de> - { - match bincode::deserialize::<$ty>(&self.0) { - Ok(val) => val.into_deserializer().$de_method(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - )* - } -} - -impl<'de, 'a> de::Deserializer<'de> for Value { - type Error = DeserializerError; - - fn deserialize_any(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.0.into_deserializer().deserialize_any(visitor) - } - - fn deserialize_str(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_string(visitor) - } - - fn deserialize_string(self, visitor: V) -> Result - where V: Visitor<'de> - { - match bincode::deserialize::(&self.0) { - Ok(val) => val.into_deserializer().deserialize_string(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - - fn deserialize_bytes(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_byte_buf(visitor) - } - - fn deserialize_byte_buf(self, visitor: V) -> Result - where V: Visitor<'de> - { - match bincode::deserialize::>(&self.0) { - Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - - forward_to_bincode_values! { - char => deserialize_char, - bool => deserialize_bool, - - u8 => deserialize_u8, - u16 => deserialize_u16, - u32 => deserialize_u32, - u64 => deserialize_u64, - - i8 => deserialize_i8, - i16 => deserialize_i16, - i32 => deserialize_i32, - i64 => deserialize_i64, - - f32 => deserialize_f32, - f64 => deserialize_f64, - } - - forward_to_deserialize_any! { - unit seq map - unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } -} - -#[derive(Debug)] -pub enum DeserializerError { - Custom(String), -} - -impl de::Error for DeserializerError { - fn custom(msg: T) -> Self { - DeserializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for DeserializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - DeserializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for DeserializerError {} diff --git a/meilidb/src/database/serde/find_id.rs b/meilidb/src/database/serde/find_id.rs deleted file mode 100644 index 3c44b5e35..000000000 --- a/meilidb/src/database/serde/find_id.rs +++ /dev/null @@ -1,243 +0,0 @@ -use serde::Serialize; -use serde::ser; - -use crate::database::serde::key_to_string::KeyToStringSerializer; -use crate::database::serde::{SerializerError, calculate_hash}; -use meilidb_core::DocumentId; - -pub struct FindDocumentIdSerializer<'a> { - pub id_attribute_name: &'a str, -} - -impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = FindDocumentIdMapSerializer<'a>; - type SerializeStruct = FindDocumentIdStructSerializer<'a>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(FindDocumentIdMapSerializer { - id_attribute_name: self.id_attribute_name, - document_id: None, - current_key_name: None, - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(FindDocumentIdStructSerializer { - id_attribute_name: self.id_attribute_name, - document_id: None, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -pub struct FindDocumentIdMapSerializer<'a> { - id_attribute_name: &'a str, - document_id: Option, - current_key_name: Option, -} - -impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - self.current_key_name = Some(key); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V - ) -> Result<(), Self::Error> - where K: Serialize, V: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - - if self.id_attribute_name == key { - // TODO is it possible to have multiple ids? - let id = bincode::serialize(value).unwrap(); - let hash = calculate_hash(&id); - self.document_id = Some(DocumentId(hash)); - } - - Ok(()) - } - - fn end(self) -> Result { - match self.document_id { - Some(document_id) => Ok(document_id), - None => Err(SerializerError::DocumentIdNotFound) - } - } -} - -pub struct FindDocumentIdStructSerializer<'a> { - id_attribute_name: &'a str, - document_id: Option, -} - -impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if self.id_attribute_name == key { - // TODO can it be possible to have multiple ids? - let id = bincode::serialize(value).unwrap(); - let hash = calculate_hash(&id); - self.document_id = Some(DocumentId(hash)); - } - - Ok(()) - } - - fn end(self) -> Result { - match self.document_id { - Some(document_id) => Ok(document_id), - None => Err(SerializerError::DocumentIdNotFound) - } - } -} diff --git a/meilidb/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs deleted file mode 100644 index ae5a0e4cb..000000000 --- a/meilidb/src/database/serde/indexer_serializer.rs +++ /dev/null @@ -1,190 +0,0 @@ -use std::collections::HashSet; - -use serde::Serialize; -use serde::ser; -use meilidb_core::{DocumentId, DocIndex}; -use meilidb_tokenizer::{Tokenizer, Token, is_cjk}; - -use crate::database::update::DocumentUpdate; -use crate::database::serde::SerializerError; -use crate::database::schema::SchemaAttr; - -pub struct IndexerSerializer<'a, 'b> { - pub update: &'a mut DocumentUpdate<'b>, - pub document_id: DocumentId, - pub attribute: SchemaAttr, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, v: &str) -> Result { - for token in Tokenizer::new(v) { - let Token { word, word_index, char_index } = token; - let document_id = self.document_id; - - // FIXME must u32::try_from instead - let attribute = self.attribute.0; - let word_index = word_index as u16; - - // insert the exact representation - let word_lower = word.to_lowercase(); - let length = word.chars().count() as u16; - - if self.stop_words.contains(&word_lower) { continue } - - // and the unidecoded lowercased version - if !word_lower.chars().any(is_cjk) { - let word_unidecoded = unidecode::unidecode(word).to_lowercase(); - let word_unidecoded = word_unidecoded.trim(); - if word_lower != word_unidecoded { - let char_index = char_index as u16; - let char_length = length; - - let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; - self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?; - } - } - - let char_index = char_index as u16; - let char_length = length; - - let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; - self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?; - } - Ok(()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "seq" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/meilidb/src/database/serde/key_to_string.rs b/meilidb/src/database/serde/key_to_string.rs deleted file mode 100644 index 2fe0c5a39..000000000 --- a/meilidb/src/database/serde/key_to_string.rs +++ /dev/null @@ -1,146 +0,0 @@ -use serde::Serialize; -use serde::ser; - -use crate::database::serde::SerializerError; - -pub struct KeyToStringSerializer; - -impl ser::Serializer for KeyToStringSerializer { - type Ok = String; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, value: &str) -> Result { - Ok(value.to_string()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/meilidb/src/database/serde/mod.rs b/meilidb/src/database/serde/mod.rs deleted file mode 100644 index 493124f7e..000000000 --- a/meilidb/src/database/serde/mod.rs +++ /dev/null @@ -1,65 +0,0 @@ -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; -use std::error::Error; -use std::fmt; - -use serde::ser; - -macro_rules! forward_to_unserializable_type { - ($($ty:ident => $se_method:ident,)*) => { - $( - fn $se_method(self, _v: $ty) -> Result { - Err(SerializerError::UnserializableType { name: "$ty" }) - } - )* - } -} - -pub mod find_id; -pub mod key_to_string; -pub mod value_to_number; -pub mod serializer; -pub mod indexer_serializer; -pub mod deserializer; - -pub fn calculate_hash(t: &T) -> u64 { - let mut s = DefaultHasher::new(); - t.hash(&mut s); - s.finish() -} - -#[derive(Debug)] -pub enum SerializerError { - DocumentIdNotFound, - UnserializableType { name: &'static str }, - Custom(String), -} - -impl ser::Error for SerializerError { - fn custom(msg: T) -> Self { - SerializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for SerializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - SerializerError::DocumentIdNotFound => { - write!(f, "serialized document does not have an id according to the schema") - } - SerializerError::UnserializableType { name } => { - write!(f, "Only struct and map types are considered valid documents and - can be serialized, not {} types directly.", name) - }, - SerializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for SerializerError {} - -impl From for SerializerError { - fn from(value: String) -> SerializerError { - SerializerError::Custom(value) - } -} diff --git a/meilidb/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs deleted file mode 100644 index e1be310ed..000000000 --- a/meilidb/src/database/serde/serializer.rs +++ /dev/null @@ -1,282 +0,0 @@ -use std::collections::HashSet; - -use serde::Serialize; -use serde::ser; - -use crate::database::serde::indexer_serializer::IndexerSerializer; -use crate::database::serde::key_to_string::KeyToStringSerializer; -use crate::database::serde::value_to_number::ValueToNumberSerializer; -use crate::database::update::DocumentUpdate; -use crate::database::serde::SerializerError; -use crate::database::schema::Schema; -use meilidb_core::DocumentId; - -pub struct Serializer<'a, 'b> { - pub schema: &'a Schema, - pub update: &'a mut DocumentUpdate<'b>, - pub document_id: DocumentId, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapSerializer<'a, 'b>; - type SerializeStruct = StructSerializer<'a, 'b>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(MapSerializer { - schema: self.schema, - document_id: self.document_id, - update: self.update, - stop_words: self.stop_words, - current_key_name: None, - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(StructSerializer { - schema: self.schema, - document_id: self.document_id, - update: self.update, - stop_words: self.stop_words, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -pub struct MapSerializer<'a, 'b> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub update: &'a mut DocumentUpdate<'b>, - pub stop_words: &'a HashSet, - pub current_key_name: Option, -} - -impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - self.current_key_name = Some(key); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where K: Serialize, V: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if props.is_stored() { - let value = bincode::serialize(value).unwrap(); - self.update.insert_attribute_value(attr, &value)?; - } - if props.is_indexed() { - let serializer = IndexerSerializer { - update: self.update, - document_id: self.document_id, - attribute: attr, - stop_words: self.stop_words, - }; - value.serialize(serializer)?; - } - if props.is_ranked() { - let number = value.serialize(ValueToNumberSerializer)?; - self.update.register_ranked_attribute(attr, number)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} - -pub struct StructSerializer<'a, 'b> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub update: &'a mut DocumentUpdate<'b>, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if props.is_stored() { - let value = bincode::serialize(value).unwrap(); - self.update.insert_attribute_value(attr, &value)?; - } - if props.is_indexed() { - let serializer = IndexerSerializer { - update: self.update, - document_id: self.document_id, - attribute: attr, - stop_words: self.stop_words, - }; - value.serialize(serializer)?; - } - if props.is_ranked() { - let integer = value.serialize(ValueToNumberSerializer)?; - self.update.register_ranked_attribute(attr, integer)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} diff --git a/meilidb/src/database/serde/value_to_number.rs b/meilidb/src/database/serde/value_to_number.rs deleted file mode 100644 index a70b92fc4..000000000 --- a/meilidb/src/database/serde/value_to_number.rs +++ /dev/null @@ -1,176 +0,0 @@ -use std::str::FromStr; - -use serde::Serialize; -use serde::{ser, ser::Error}; - -use crate::database::serde::SerializerError; -use crate::database::Number; - -pub struct ValueToNumberSerializer; - -impl ser::Serializer for ValueToNumberSerializer { - type Ok = Number; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - } - - fn serialize_i8(self, value: i8) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_i16(self, value: i16) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_i32(self, value: i32) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_i64(self, value: i64) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_u8(self, value: u8) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_u16(self, value: u16) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_u32(self, value: u32) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_u64(self, value: u64) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_f32(self, value: f32) -> Result { - Ok(Number::Float(value as f64)) - } - - fn serialize_f64(self, value: f64) -> Result { - Ok(Number::Float(value)) - } - - fn serialize_str(self, value: &str) -> Result { - Number::from_str(value).map_err(SerializerError::custom) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/meilidb/src/database/update/index_event.rs b/meilidb/src/database/update/index_event.rs deleted file mode 100644 index 20dbcbf46..000000000 --- a/meilidb/src/database/update/index_event.rs +++ /dev/null @@ -1,55 +0,0 @@ -use std::error::Error; - -use byteorder::{ReadBytesExt, WriteBytesExt}; -use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::data::DocIds; - -use crate::database::Index; - -pub enum WriteIndexEvent<'a> { - RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a Index), -} - -impl<'a> WriteToBytes for WriteIndexEvent<'a> { - fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - WriteIndexEvent::RemovedDocuments(doc_ids) => { - let _ = bytes.write_u8(0); - doc_ids.write_to_bytes(bytes); - }, - WriteIndexEvent::UpdatedDocuments(index) => { - let _ = bytes.write_u8(1); - index.write_to_bytes(bytes); - } - } - } -} - -pub enum ReadIndexEvent { - RemovedDocuments(DocIds), - UpdatedDocuments(Index), -} - -impl ReadIndexEvent { - pub fn updated_documents(self) -> Option { - use ReadIndexEvent::*; - match self { - RemovedDocuments(_) => None, - UpdatedDocuments(index) => Some(index), - } - } -} - -impl FromSharedDataCursor for ReadIndexEvent { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { - match cursor.read_u8()? { - 0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments), - 1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments), - _ => unreachable!(), - } - } -} diff --git a/meilidb/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs deleted file mode 100644 index f34cf6a8e..000000000 --- a/meilidb/src/database/update/mod.rs +++ /dev/null @@ -1,234 +0,0 @@ -use std::collections::{HashSet, BTreeMap}; -use std::error::Error; - -use rocksdb::rocksdb::{Writable, WriteBatch}; -use hashbrown::hash_map::HashMap; -use sdset::{Set, SetBuf}; -use serde::Serialize; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::data::DocIds; -use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; - -use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::database::serde::serializer::Serializer; -use crate::database::serde::SerializerError; -use crate::database::schema::SchemaAttr; -use crate::database::schema::Schema; -use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; -use crate::database::{RankedMap, Number}; - -pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; -pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; - -mod index_event; -mod ranked_map_event; - -pub type Token = Vec; // TODO could be replaced by a SmallVec - -pub struct Update { - schema: Schema, - raw_builder: RawUpdateBuilder, -} - -impl Update { - pub(crate) fn new(schema: Schema) -> Update { - Update { schema, raw_builder: RawUpdateBuilder::new() } - } - - pub fn update_document( - &mut self, - document: T, - stop_words: &HashSet, - ) -> Result - where T: Serialize, - { - let document_id = self.schema.document_id(&document)?; - - let serializer = Serializer { - schema: &self.schema, - document_id: document_id, - update: &mut self.raw_builder.document_update(document_id)?, - stop_words: stop_words, - }; - - document.serialize(serializer)?; - - Ok(document_id) - } - - pub fn remove_document(&mut self, document: T) -> Result - where T: Serialize, - { - let document_id = self.schema.document_id(&document)?; - self.raw_builder.document_update(document_id)?.remove()?; - Ok(document_id) - } - - pub(crate) fn build(self) -> Result> { - self.raw_builder.build() - } -} - -#[derive(Copy, Clone, PartialEq, Eq)] -enum UpdateType { - Updated, - Deleted, -} - -use UpdateType::{Updated, Deleted}; - -pub struct RawUpdateBuilder { - documents_update: HashMap, - documents_ranked_fields: RankedMap, - indexed_words: BTreeMap>, - batch: WriteBatch, -} - -impl RawUpdateBuilder { - pub fn new() -> RawUpdateBuilder { - RawUpdateBuilder { - documents_update: HashMap::new(), - documents_ranked_fields: HashMap::new(), - indexed_words: BTreeMap::new(), - batch: WriteBatch::new(), - } - } - - pub fn document_update(&mut self, document_id: DocumentId) -> Result { - use serde::ser::Error; - - match self.documents_update.get(&document_id) { - Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }), - Some(Updated) => Err(SerializerError::custom( - "This document has already been removed and cannot be updated in the same update" - )), - } - } - - pub fn build(self) -> Result> { - // create the list of all the removed documents - let removed_documents = { - let mut document_ids = Vec::new(); - for (id, update_type) in self.documents_update { - if update_type == Deleted { - document_ids.push(id); - } - } - - document_ids.sort_unstable(); - let setbuf = SetBuf::new_unchecked(document_ids); - DocIds::new(&setbuf) - }; - - // create the Index of all the document updates - let index = { - let mut builder = IndexBuilder::new(); - for (key, mut indexes) in self.indexed_words { - indexes.sort_unstable(); - let indexes = Set::new_unchecked(&indexes); - builder.insert(key, indexes).unwrap(); - } - builder.build() - }; - - // WARN: removed documents must absolutely - // be merged *before* document updates - - // === index === - - if !removed_documents.is_empty() { - // remove the documents using the appropriate IndexEvent - let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes(); - self.batch.merge(DATA_INDEX, &event_bytes)?; - } - - // update the documents using the appropriate IndexEvent - let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes(); - self.batch.merge(DATA_INDEX, &event_bytes)?; - - // === ranked map === - - if !removed_documents.is_empty() { - // update the ranked map using the appropriate RankedMapEvent - let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes(); - self.batch.merge(DATA_RANKED_MAP, &event_bytes)?; - } - - // update the documents using the appropriate IndexEvent - let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes(); - self.batch.merge(DATA_RANKED_MAP, &event_bytes)?; - - Ok(self.batch) - } -} - -pub struct DocumentUpdate<'a> { - document_id: DocumentId, - inner: &'a mut RawUpdateBuilder, -} - -impl<'a> DocumentUpdate<'a> { - pub fn remove(&mut self) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) { - return Err(SerializerError::custom( - "This document has already been updated and cannot be removed in the same update" - )); - } - - let start = DocumentKey::new(self.document_id).with_attribute_min(); - let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1 - self.inner.batch.delete_range(start.as_ref(), end.as_ref())?; - - Ok(()) - } - - pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted and cannot be updated in the same update" - )); - } - - let key = DocumentKeyAttr::new(self.document_id, attr); - self.inner.batch.put(key.as_ref(), &value)?; - - Ok(()) - } - - pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted and cannot be updated in the same update" - )); - } - - self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index); - - Ok(()) - } - - pub fn register_ranked_attribute( - &mut self, - attr: SchemaAttr, - number: Number, - ) -> Result<(), SerializerError> - { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted, ranked attributes cannot be added in the same update" - )); - } - - self.inner.documents_ranked_fields.insert((self.document_id, attr), number); - - Ok(()) - } -} diff --git a/meilidb/src/database/update/ranked_map_event.rs b/meilidb/src/database/update/ranked_map_event.rs deleted file mode 100644 index 428bc62cf..000000000 --- a/meilidb/src/database/update/ranked_map_event.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::error::Error; - -use byteorder::{ReadBytesExt, WriteBytesExt}; -use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::data::DocIds; - -use crate::database::RankedMap; - -pub enum WriteRankedMapEvent<'a> { - RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a RankedMap), -} - -impl<'a> WriteToBytes for WriteRankedMapEvent<'a> { - fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - WriteRankedMapEvent::RemovedDocuments(doc_ids) => { - let _ = bytes.write_u8(0); - doc_ids.write_to_bytes(bytes); - }, - WriteRankedMapEvent::UpdatedDocuments(ranked_map) => { - let _ = bytes.write_u8(1); - bincode::serialize_into(bytes, ranked_map).unwrap() - } - } - } -} - -pub enum ReadRankedMapEvent { - RemovedDocuments(DocIds), - UpdatedDocuments(RankedMap), -} - -impl ReadRankedMapEvent { - pub fn updated_documents(self) -> Option { - use ReadRankedMapEvent::*; - match self { - RemovedDocuments(_) => None, - UpdatedDocuments(ranked_map) => Some(ranked_map), - } - } -} - -impl FromSharedDataCursor for ReadRankedMapEvent { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { - match cursor.read_u8()? { - 0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments), - 1 => { - let ranked_map = bincode::deserialize_from(cursor)?; - Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map)) - }, - _ => unreachable!(), - } - } -} diff --git a/meilidb/src/database/view.rs b/meilidb/src/database/view.rs deleted file mode 100644 index fcbb3fea1..000000000 --- a/meilidb/src/database/view.rs +++ /dev/null @@ -1,199 +0,0 @@ -use std::error::Error; -use std::path::Path; -use std::ops::Deref; -use std::{fmt, marker}; - -use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; -use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; -use serde::de::DeserializeOwned; -use meilidb_core::{Index, QueryBuilder, DocumentId}; - -use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; -use crate::database::serde::deserializer::Deserializer; -use crate::database::{DocumentKey, DocumentKeyAttr}; -use crate::database::schema::Schema; -use crate::database::RankedMap; -use crate::database::Config; - -pub struct DatabaseView -where D: Deref -{ - snapshot: Snapshot, - index: Index, - ranked_map: RankedMap, - schema: Schema, - config: Config, -} - -impl DatabaseView -where D: Deref -{ - pub fn new(snapshot: Snapshot) -> Result, Box> { - let schema = retrieve_data_schema(&snapshot)?; - let index = retrieve_data_index(&snapshot)?; - let ranked_map = retrieve_data_ranked_map(&snapshot)?; - let config = retrieve_config(&snapshot)?; - Ok(DatabaseView { snapshot, index, ranked_map, schema, config }) - } - - pub fn schema(&self) -> &Schema { - &self.schema - } - - pub fn index(&self) -> &Index { - &self.index - } - - pub fn ranked_map(&self) -> &RankedMap { - &self.ranked_map - } - - pub fn into_snapshot(self) -> Snapshot { - self.snapshot - } - - pub fn snapshot(&self) -> &Snapshot { - &self.snapshot - } - - pub fn config(&self) -> &Config { - &self.config - } - - pub fn get(&self, key: &[u8]) -> Result, Box> { - Ok(self.snapshot.get(key)?) - } - - pub fn dump_all>(&self, path: P) -> Result<(), Box> { - let path = path.as_ref().to_string_lossy(); - - let env_options = EnvOptions::new(); - let column_family_options = ColumnFamilyOptions::new(); - let mut file_writer = SstFileWriter::new(env_options, column_family_options); - file_writer.open(&path)?; - - let mut iter = self.snapshot.iter(); - iter.seek(SeekKey::Start); - - for (key, value) in &mut iter { - file_writer.put(&key, &value)?; - } - - file_writer.finish()?; - Ok(()) - } - - pub fn query_builder(&self) -> QueryBuilder { - QueryBuilder::new(self.index()) - } - - pub fn raw_field_by_document_id( - &self, - name: &str, - id: DocumentId - ) -> Result>, Box> - { - let attr = self.schema.attribute(name).ok_or("field not found")?; - let key = DocumentKeyAttr::new(id, attr); - let vector = self.snapshot.get(key.as_ref())?; - - Ok(vector.map(|v| v.to_vec())) - } - - pub fn document_by_id(&self, id: DocumentId) -> Result> - where T: DeserializeOwned, - { - let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id); - Ok(T::deserialize(&mut deserializer)?) - } - - pub fn documents_by_id(&self, ids: I) -> DocumentIter - where T: DeserializeOwned, - I: IntoIterator, - { - DocumentIter { - database_view: self, - document_ids: ids.into_iter(), - _phantom: marker::PhantomData, - } - } -} - -impl fmt::Debug for DatabaseView -where D: Deref -{ - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mut options = ReadOptions::new(); - let lower = DocumentKey::new(DocumentId(0)); - options.set_iterate_lower_bound(lower.as_ref()); - - let mut iter = self.snapshot.iter_opt(options); - iter.seek(SeekKey::Start); - let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key)); - - if f.alternate() { - writeln!(f, "DatabaseView(")?; - } else { - write!(f, "DatabaseView(")?; - } - - self.schema.fmt(f)?; - - if f.alternate() { - writeln!(f, ",")?; - } else { - write!(f, ", ")?; - } - - f.debug_list().entries(iter).finish()?; - - write!(f, ")") - } -} - -// TODO this is just an iter::Map !!! -pub struct DocumentIter<'a, D, T, I> -where D: Deref -{ - database_view: &'a DatabaseView, - document_ids: I, - _phantom: marker::PhantomData, -} - -impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: Iterator, -{ - type Item = Result>; - - fn size_hint(&self) -> (usize, Option) { - self.document_ids.size_hint() - } - - fn next(&mut self) -> Option { - match self.document_ids.next() { - Some(id) => Some(self.database_view.document_by_id(id)), - None => None - } - } -} - -impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: ExactSizeIterator + Iterator, -{ } - -impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: DoubleEndedIterator + Iterator, -{ - fn next_back(&mut self) -> Option { - match self.document_ids.next_back() { - Some(id) => Some(self.database_view.document_by_id(id)), - None => None - } - } -} diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs index 325df65eb..aba7ab6a7 100644 --- a/meilidb/src/lib.rs +++ b/meilidb/src/lib.rs @@ -1,10 +1,7 @@ #![cfg_attr(feature = "nightly", feature(test))] -pub mod database; mod common_words; mod sort_by_attr; -pub use rocksdb; - pub use self::sort_by_attr::SortByAttr; pub use self::common_words::CommonWords;