From 3dc057ca9cb679bd5354c3ebf9248fb330ce5e88 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Fri, 19 Apr 2019 18:27:57 +0200
Subject: [PATCH 01/32] feat: Introduce the new Index system

---
 meilidb-core/Cargo.toml           |   1 +
 meilidb-core/src/index.rs         | 227 ++++++++++++------------------
 meilidb-core/src/lib.rs           |   8 +-
 meilidb-core/src/query_builder.rs |  32 +++--
 meilidb-data/Cargo.toml           |   7 +-
 meilidb-data/src/database.rs      |  61 ++++----
 meilidb-data/src/index_event.rs   |  16 +--
 meilidb-data/src/indexer.rs       |  85 +++++++++--
 8 files changed, 235 insertions(+), 202 deletions(-)

diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml
index 16bc204d4..88363cbad 100644
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@@ -14,6 +14,7 @@ rayon = "1.0.3"
 sdset = "0.3.1"
 serde = { version = "1.0.88", features = ["derive"] }
 slice-group-by = "0.2.4"
+zerocopy = "0.2.2"

 [dependencies.fst]
 git = "https://github.com/Kerollmops/fst.git"
diff --git a/meilidb-core/src/index.rs b/meilidb-core/src/index.rs
index 44a76d359..87b59e682 100644
--- a/meilidb-core/src/index.rs
+++ b/meilidb-core/src/index.rs
@@ -1,175 +1,134 @@
-use std::error::Error;
-
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use fst::{map, Map, IntoStreamer, Streamer};
-use fst::raw::Fst;
+use std::collections::BTreeMap;
+use fst::{set, IntoStreamer, Streamer};
+use sdset::{Set, SetBuf, SetOperation};
 use sdset::duo::{Union, DifferenceByKey};
-use sdset::{Set, SetOperation};
+use crate::{DocIndex, DocumentId};

-use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
-use crate::write_to_bytes::WriteToBytes;
-use crate::data::{DocIndexes, DocIndexesBuilder};
-use crate::{DocumentId, DocIndex};
+pub type Word = Vec<u8>; // TODO should be a smallvec

-#[derive(Default)]
-pub struct Index {
-    pub map: Map,
-    pub indexes: DocIndexes,
+pub trait Store: Clone {
+    type Error: std::error::Error;
+
+    fn get_fst(&self) -> Result<fst::Set, Self::Error>;
+    fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error>;
+
+    fn get_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
+    fn set_indexes(&self, word: &[u8], indexes: &Set<DocIndex>) -> Result<(), Self::Error>;
+    fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error>;
 }

-impl Index {
-    pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
+pub struct Index<S> {
+    pub set: fst::Set,
+    pub store: S,
+}
+
+impl<S> Index<S>
+where S: Store,
+{
+    pub fn from_store(store: S) -> Result<Index<S>, S::Error> {
+        let set = store.get_fst()?;
+        Ok(Index { set, store })
+    }
+
+    pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Result<Index<S>, S::Error> {
         let mut buffer = Vec::new();
-        let mut builder = IndexBuilder::new();
+        let mut builder = fst::SetBuilder::memory();
         let mut stream = self.into_stream();

-        while let Some((key, indexes)) = stream.next() {
-            buffer.clear();
+        while let Some((input, result)) = stream.next() {
+            let indexes = match result? {
+                Some(indexes) => indexes,
+                None => continue,
+            };

-            let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
+            let op = DifferenceByKey::new(&indexes, documents, |x| x.document_id, |x| *x);
+            buffer.clear();
             op.extend_vec(&mut buffer);

-            if !buffer.is_empty() {
+            if buffer.is_empty() {
+                self.store.del_indexes(input)?;
+            } else {
+                builder.insert(input).unwrap();
                 let indexes = Set::new_unchecked(&buffer);
-                builder.insert(key, indexes).unwrap();
+                self.store.set_indexes(input, indexes)?;
             }
         }

-        builder.build()
+        let set = builder.into_inner().and_then(fst::Set::from_bytes).unwrap();
+        self.store.set_fst(&set)?;
+
+        Ok(Index { set, store: self.store.clone() })
     }

-    pub fn union(&self, other: &Index) -> Index {
-        let mut builder = IndexBuilder::new();
-        let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
-
+    pub fn insert_indexes(&self, map: BTreeMap<Word, SetBuf<DocIndex>>) -> Result<Index<S>, S::Error> {
         let mut buffer = Vec::new();
-        while let Some((key, ivalues)) = stream.next() {
-            buffer.clear();
-            match ivalues {
-                [a, b] => {
-                    let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = &indexes[a.value as usize];
-                    let a = Set::new_unchecked(indexes);
+        let mut builder = fst::SetBuilder::memory();
+        let set = fst::Set::from_iter(map.keys()).unwrap();
+        let mut union_ = self.set.op().add(&set).r#union();

-                    let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = &indexes[b.value as usize];
-                    let b = Set::new_unchecked(indexes);
+        while let Some(input) = union_.next() {
+            let remote = self.store.get_indexes(input)?;
+            let locale = map.get(input);

-                    let op = Union::new(a, b);
-                    op.extend_vec(&mut buffer);
+            match (remote, locale) {
+                (Some(remote), Some(locale)) => {
+                    buffer.clear();
+                    Union::new(&remote, &locale).extend_vec(&mut buffer);
+                    let indexes = Set::new_unchecked(&buffer);
+
+                    if !indexes.is_empty() {
+                        self.store.set_indexes(input, indexes)?;
+                        builder.insert(input).unwrap();
+                    } else {
+                        self.store.del_indexes(input)?;
+                    }
                 },
-                [x] => {
-                    let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = &indexes[x.value as usize];
-                    buffer.extend_from_slice(indexes)
+                (None, Some(locale)) => {
+                    self.store.set_indexes(input, &locale)?;
+                    builder.insert(input).unwrap();
                 },
-                _ => continue,
-            }
-
-            if !buffer.is_empty() {
-                let indexes = Set::new_unchecked(&buffer);
-                builder.insert(key, indexes).unwrap();
+                (Some(_), None) => {
+                    builder.insert(input).unwrap();
+                },
+                (None, None) => unreachable!(),
             }
         }

-        builder.build()
+        let set = builder.into_inner().and_then(fst::Set::from_bytes).unwrap();
+        self.store.set_fst(&set)?;
+
+        Ok(Index { set, store: self.store.clone() })
     }
 }

-impl FromSharedDataCursor for Index {
-    type Error = Box<Error>;
-
-    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let data = cursor.extract(len);
-
-        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
-        let map = Map::from(fst);
-
-        let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
-
-        Ok(Index { map, indexes })
-    }
+pub struct Stream<'m, S> {
+    set_stream: set::Stream<'m>,
+    store: &'m S,
 }

-impl WriteToBytes for Index {
-    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let slice = self.map.as_fst().as_bytes();
-        let len = slice.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(len);
-        bytes.extend_from_slice(slice);
-
-        self.indexes.write_to_bytes(bytes);
-    }
-}
-
-impl<'m, 'a> IntoStreamer<'a> for &'m Index {
-    type Item = (&'a [u8], &'a Set<DocIndex>);
-    type Into = Stream<'m>;
-
-    fn into_stream(self) -> Self::Into {
-        Stream {
-            map_stream: self.map.into_stream(),
-            indexes: &self.indexes,
-        }
-    }
-}
-
-pub struct Stream<'m> {
-    map_stream: map::Stream<'m>,
-    indexes: &'m DocIndexes,
-}
-
-impl<'m, 'a> Streamer<'a> for Stream<'m> {
-    type Item = (&'a [u8], &'a Set<DocIndex>);
+impl<'m, 'a, S> Streamer<'a> for Stream<'m, S>
+where S: 'a + Store,
+{
+    type Item = (&'a [u8], Result<Option<SetBuf<DocIndex>>, S::Error>);

     fn next(&'a mut self) -> Option<Self::Item> {
-        match self.map_stream.next() {
-            Some((input, index)) => {
-                let indexes = &self.indexes[index as usize];
-                let indexes = Set::new_unchecked(indexes);
-                Some((input, indexes))
-            },
+        match self.set_stream.next() {
+            Some(input) => Some((input, self.store.get_indexes(input))),
             None => None,
         }
     }
 }

-pub struct IndexBuilder {
-    map: fst::MapBuilder<Vec<u8>>,
-    indexes: DocIndexesBuilder<Vec<u8>>,
-    value: u64,
-}
+impl<'m, 'a, S> IntoStreamer<'a> for &'m Index<S>
+where S: 'a + Store,
+{
+    type Item = (&'a [u8], Result<Option<SetBuf<DocIndex>>, S::Error>);
+    type Into = Stream<'m, S>;

-impl IndexBuilder {
-    pub fn new() -> Self {
-        IndexBuilder {
-            map: fst::MapBuilder::memory(),
-            indexes: DocIndexesBuilder::memory(),
-            value: 0,
+    fn into_stream(self) -> Self::Into {
+        Stream {
+            set_stream: self.set.into_stream(),
+            store: &self.store,
         }
     }
-
-    /// If a key is inserted that is less than or equal to any previous key added,
-    /// then an error is returned. Similarly, if there was a problem writing
-    /// to the underlying writer, an error is returned.
-    // FIXME what if one write doesn't work but the other do ?
-    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
-    where K: AsRef<[u8]>,
-    {
-        self.map.insert(key, self.value)?;
-        self.indexes.insert(indexes);
-        self.value += 1;
-        Ok(())
-    }
-
-    pub fn build(self) -> Index {
-        let map = self.map.into_inner().unwrap();
-        let indexes = self.indexes.into_inner().unwrap();
-
-        let map = Map::from_bytes(map).unwrap();
-        let indexes = DocIndexes::from_bytes(indexes).unwrap();
-
-        Index { map, indexes }
-    }
 }
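[Annotation] The `Store` trait above is the whole storage contract of the new Index: an fst of the known words plus per-word doc-indexes. As a reviewer sketch (not part of the patch), a minimal in-memory implementation could look like the following; the `MemStore` name and the choice of `io::Error` as the error type are assumptions made for illustration only.

    // Hypothetical in-memory Store, using only fst/sdset calls that appear
    // in this patch series (Set::as_slice, SetBuf::new_unchecked, etc.).
    use std::cell::RefCell;
    use std::collections::HashMap;
    use std::rc::Rc;

    use sdset::{Set, SetBuf};
    use meilidb_core::{DocIndex, Store};

    #[derive(Clone)]
    pub struct MemStore(Rc<RefCell<Inner>>);

    pub struct Inner {
        fst_bytes: Vec<u8>,
        indexes: HashMap<Vec<u8>, SetBuf<DocIndex>>,
    }

    impl Store for MemStore {
        type Error = std::io::Error;

        fn get_fst(&self) -> Result<fst::Set, Self::Error> {
            let bytes = self.0.borrow().fst_bytes.clone();
            if bytes.is_empty() {
                // An absent fst behaves like an empty set, as in the sled store.
                Ok(fst::Set::default())
            } else {
                Ok(fst::Set::from_bytes(bytes).unwrap())
            }
        }

        fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error> {
            self.0.borrow_mut().fst_bytes = set.as_fst().to_vec();
            Ok(())
        }

        fn get_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
            let inner = self.0.borrow();
            let indexes = inner.indexes.get(word);
            Ok(indexes.map(|s| SetBuf::new_unchecked(s.as_slice().to_vec())))
        }

        fn set_indexes(&self, word: &[u8], indexes: &Set<DocIndex>) -> Result<(), Self::Error> {
            let owned = SetBuf::new_unchecked(indexes.as_slice().to_vec());
            self.0.borrow_mut().indexes.insert(word.to_vec(), owned);
            Ok(())
        }

        fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error> {
            self.0.borrow_mut().indexes.remove(word);
            Ok(())
        }
    }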
diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 18e9a99cc..5bff103cd 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -13,16 +13,19 @@
 use serde::{Serialize, Deserialize};
 use slice_group_by::GroupBy;
 use rayon::slice::ParallelSliceMut;
+use zerocopy::{AsBytes, FromBytes};

-pub use self::index::{Index, IndexBuilder};
+pub use self::index::{Index, Store};
 pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};

 /// Represent an internally generated document unique identifier.
 ///
 /// It is used to inform the database the document you want to deserialize.
 /// Helpful for custom ranking.
-#[derive(Serialize, Deserialize)]
 #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+#[derive(Serialize, Deserialize)]
+#[derive(AsBytes, FromBytes)]
+#[repr(C)]
 pub struct DocumentId(pub u64);

 /// This structure represent the position of a word
@@ -31,6 +34,7 @@ pub struct DocumentId(pub u64);
 /// This is stored in the map, generated at index time,
 /// extracted and interpreted at search time.
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocIndex {
     /// The document identifier where the word was found.
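[Annotation] The `AsBytes`/`FromBytes` derives plus `#[repr(C)]` are what later let the sled-backed stores persist slices of `DocIndex` without any serialization step. A short sketch of the round-trip they enable, using only the zerocopy 0.2 calls that already appear in this patch series (the function names here are mine, not the patch's):

    use zerocopy::{AsBytes, LayoutVerified};
    use meilidb_core::DocIndex;

    fn doc_indexes_as_bytes(indexes: &[DocIndex]) -> &[u8] {
        // Free reinterpretation: DocIndex is #[repr(C)] and AsBytes.
        indexes.as_bytes()
    }

    fn doc_indexes_from_bytes(bytes: &[u8]) -> Option<&[DocIndex]> {
        // None when the length is not a multiple of size_of::<DocIndex>()
        // or when the slice is misaligned for DocIndex.
        let layout = LayoutVerified::<_, [DocIndex]>::new_slice(bytes)?;
        Some(layout.into_slice())
    }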
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index ad7de9c15..40e172403 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -14,8 +14,8 @@ use log::info;
 use crate::automaton::{self, DfaExt, AutomatonExt};
 use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
-use crate::{raw_documents_from_matches, RawDocument, Document};
-use crate::{Index, Match, DocumentId};
+use crate::raw_documents_from_matches;
+use crate::{Match, DocumentId, Index, Store, RawDocument, Document};

 fn generate_automatons(query: &str) -> Vec<DfaExt> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
@@ -82,16 +82,18 @@ impl<'c, I, FI> QueryBuilder<'c, I, FI>
     }
 }

-impl<'c, I, FI> QueryBuilder<'c, I, FI>
-where I: Deref<Target=Index>,
+impl<'c, I, FI, S> QueryBuilder<'c, I, FI>
+where I: Deref<Target=Index<S>>,
+      S: Store,
 {
     fn query_all(&self, query: &str) -> Vec<RawDocument> {
         let automatons = generate_automatons(query);
+        let fst = self.index.set.as_fst();

         let mut stream = {
-            let mut op_builder = fst::map::OpBuilder::new();
+            let mut op_builder = fst::raw::OpBuilder::new();
             for automaton in &automatons {
-                let stream = self.index.map.search(automaton);
+                let stream = fst.search(automaton);
                 op_builder.push(stream);
             }
             op_builder.r#union()
         };

             let distance = automaton.eval(input).to_u8();
             let is_exact = distance == 0 && input.len() == automaton.query_len();

-            let doc_indexes = &self.index.indexes;
-            let doc_indexes = &doc_indexes[iv.value as usize];
+            // let doc_indexes = &self.index.indexes;
+            // let doc_indexes = &doc_indexes[iv.value as usize];

-            for di in doc_indexes {
+            let doc_indexes = self.index.store.get_indexes(input).unwrap().unwrap();
+
+            for di in doc_indexes.as_slice() {
                 if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
                     let match_ = Match {
                         query_index: iv.index as u32,
     }
 }

-impl<'c, I, FI> QueryBuilder<'c, I, FI>
-where I: Deref<Target=Index>,
+impl<'c, I, FI, S> QueryBuilder<'c, I, FI>
+where I: Deref<Target=Index<S>>,
       FI: Fn(DocumentId) -> bool,
+      S: Store,
 {
     pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
         // We delegate the filter work to the distinct query builder,
         // specifying a distinct rule that has no effect.

 impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
     }
 }

-impl<'c, I, FI, FD, K> DistinctQueryBuilder<'c, I, FI, FD>
-where I: Deref<Target=Index>,
+impl<'c, I, FI, FD, K, S> DistinctQueryBuilder<'c, I, FI, FD>
+where I: Deref<Target=Index<S>>,
       FI: Fn(DocumentId) -> bool,
       FD: Fn(DocumentId) -> Option<K>,
       K: Hash + Eq,
+      S: Store,
 {
     pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
         let start = Instant::now();
diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml
index 6096e4ad3..c883196d4 100644
--- a/meilidb-data/Cargo.toml
+++ b/meilidb-data/Cargo.toml
@@ -8,6 +8,7 @@ edition = "2018"
 arc-swap = "0.3.11"
 bincode = "1.1.2"
 byteorder = "1.3.1"
+deunicode = "1.0.0"
 hashbrown = { version = "0.2.2", features = ["serde"] }
 linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
 meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
@@ -18,8 +19,12 @@ serde = { version = "1.0.90", features = ["derive"] }
 serde_json = { version = "1.0.39", features = ["preserve_order"] }
 sled = "0.23.0"
 toml = { version = "0.5.0", features = ["preserve_order"] }
-deunicode = "1.0.0"
+zerocopy = "0.2.2"

 [dependencies.rmp-serde]
 git = "https://github.com/3Hren/msgpack-rust.git"
 rev = "40b3d48"
+
+[dependencies.fst]
+git = "https://github.com/Kerollmops/fst.git"
+branch = "arc-byte-slice"
diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs
index da43dd4bc..f66d31aa9 100644
--- a/meilidb-data/src/database.rs
+++ b/meilidb-data/src/database.rs
@@ -12,7 +12,7 @@ use meilidb_core::criterion::Criteria;
 use meilidb_core::QueryBuilder;
 use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
 use meilidb_core::write_to_bytes::WriteToBytes;
-use meilidb_core::{DocumentId, Index as WordIndex};
+use meilidb_core::DocumentId;
 use rmp_serde::decode::{Error as RmpError};
 use sdset::SetBuf;
 use serde::de;
@@ -20,7 +20,9 @@ use sled::IVec;

 use crate::{Schema, SchemaAttr, RankedMap};
 use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
-use crate::indexer::Indexer;
+use crate::indexer::{Indexer, WordIndexTree};
+
+pub type WordIndex = meilidb_core::Index<WordIndexTree>;

 #[derive(Debug)]
 pub enum Error {
@@ -72,6 +74,10 @@ fn index_name(name: &str) -> Vec<u8> {
     format!("index-{}", name).into_bytes()
 }

+fn word_index_name(name: &str) -> Vec<u8> {
+    format!("word-index-{}", name).into_bytes()
+}
+
 fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec<u8> {
     let DocumentId(document_id) = id;
     let SchemaAttr(schema_attr) = attr;
@@ -136,7 +142,8 @@ impl Database {
         let raw_name = index_name(name);
         if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) {
             let tree = self.inner.open_tree(raw_name)?;
-            let raw_index = RawIndex::from_raw(tree)?;
+            let word_index_tree = self.inner.open_tree(word_index_name(name))?;
+            let raw_index = RawIndex::from_raw(tree, word_index_tree)?;

             self.opened.rcu(|opened| {
                 let mut opened = HashMap::clone(opened);
@@ -162,7 +169,8 @@
             None => {
                 let raw_name = index_name(&name);
                 let tree = self.inner.open_tree(raw_name)?;
-                let raw_index = RawIndex::new_from_raw(tree, schema)?;
+                let word_index_tree = self.inner.open_tree(word_index_name(&name))?;
+                let raw_index = RawIndex::new_from_raw(tree, word_index_tree, schema)?;

                 self.opened.rcu(|opened| {
                     let mut opened = HashMap::clone(opened);
@@ -185,25 +193,16 @@ pub struct RawIndex {
 }

 impl RawIndex {
-    fn from_raw(inner: Arc<Tree>) -> Result<RawIndex, Error> {
+    fn from_raw(inner: Arc<Tree>, word_index: Arc<Tree>) -> Result<RawIndex, Error> {
         let schema = {
             let bytes = inner.get("schema")?;
             let bytes = bytes.ok_or(Error::SchemaMissing)?;
             Schema::read_from_bin(bytes.as_ref())?
         };

-        let bytes = inner.get("word-index")?;
-        let bytes = bytes.ok_or(Error::WordIndexMissing)?;
-        let word_index = {
-            let len = bytes.len();
-            let bytes: Arc<[u8]> = Into::into(bytes);
-            let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len);
-
-            // TODO must handle this error
-            let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap();
-
-            Arc::new(ArcSwap::new(Arc::new(word_index)))
-        };
+        let store = WordIndexTree(word_index);
+        let word_index = WordIndex::from_store(store)?;
+        let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));

         let ranked_map = {
             let map = match inner.get("ranked-map")? {
@@ -217,13 +216,18 @@ impl RawIndex {
         Ok(RawIndex { schema, word_index, ranked_map, inner })
     }

-    fn new_from_raw(inner: Arc<Tree>, schema: Schema) -> Result<RawIndex, Error> {
+    fn new_from_raw(
+        inner: Arc<Tree>,
+        word_index: Arc<Tree>,
+        schema: Schema,
+    ) -> Result<RawIndex, Error>
+    {
         let mut schema_bytes = Vec::new();
         schema.write_to_bin(&mut schema_bytes)?;
         inner.set("schema", schema_bytes)?;

-        let word_index = WordIndex::default();
-        inner.set("word-index", word_index.into_bytes())?;
+        let store = WordIndexTree(word_index);
+        let word_index = WordIndex::from_store(store)?;
         let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));

         let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
@@ -243,12 +247,8 @@
         self.ranked_map.lease()
     }

-    pub fn update_word_index(&self, word_index: Arc<WordIndex>) -> sled::Result<()> {
-        let data = word_index.into_bytes();
-        self.inner.set("word-index", data).map(drop)?;
-        self.word_index.store(word_index);
-
-        Ok(())
+    pub fn update_word_index(&self, word_index: Arc<WordIndex>) {
+        self.word_index.store(word_index)
     }

     pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) -> sled::Result<()> {
@@ -417,14 +417,15 @@ impl DocumentsAddition {
         Ok(())
     }

+
     pub fn finalize(self) -> sled::Result<()> {
         let delta_index = self.indexer.build();

         let index = self.inner.word_index();
-        let new_index = index.r#union(&delta_index);
+        let new_index = index.insert_indexes(delta_index)?;

         let new_index = Arc::from(new_index);
-        self.inner.update_word_index(new_index)?;
+        self.inner.update_word_index(new_index);

         Ok(())
     }
@@ -454,10 +455,10 @@
         let idset = SetBuf::new_unchecked(self.documents);

         let index = self.inner.word_index();
-        let new_index = index.remove_documents(&idset);
+        let new_index = index.remove_documents(&idset)?;

         let new_index = Arc::from(new_index);
-        self.inner.update_word_index(new_index)?;
+        self.inner.update_word_index(new_index);

         Ok(())
     }
diff --git a/meilidb-data/src/index_event.rs b/meilidb-data/src/index_event.rs
index 40d54cbf3..3a63295c9 100644
--- a/meilidb-data/src/index_event.rs
+++ b/meilidb-data/src/index_event.rs
@@ -7,12 +7,12 @@ use meilidb_core::data::DocIds;
 use meilidb_core::write_to_bytes::WriteToBytes;
 use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};

-enum NewIndexEvent<'a> {
+enum NewIndexEvent<'a, S> {
     RemovedDocuments(&'a DocIds),
-    UpdatedDocuments(&'a WordIndex),
+    UpdatedDocuments(&'a WordIndex<S>),
 }

-impl<'a> WriteToBytes for NewIndexEvent<'a> {
+impl<'a, S> WriteToBytes for NewIndexEvent<'a, S> {
     fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
         match self {
             NewIndexEvent::RemovedDocuments(doc_ids) => {
@@ -21,24 +21,24 @@
             },
             NewIndexEvent::UpdatedDocuments(index) => {
                 let _ = bytes.write_u8(1);
-                index.write_to_bytes(bytes);
+                // index.write_to_bytes(bytes);
             }
         }
     }
 }

-enum IndexEvent {
+enum IndexEvent<S> {
     RemovedDocuments(DocIds),
-    UpdatedDocuments(WordIndex),
+    UpdatedDocuments(WordIndex<S>),
 }

-impl FromSharedDataCursor for IndexEvent {
+impl<S> FromSharedDataCursor for IndexEvent<S> {
     type Error = Box<Error>;

     fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
         match cursor.read_u8()? {
             0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments),
-            1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
+            // 1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
             _ => Err("invalid index event type".into()),
         }
     }
 }
diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs
index 3cfd8f722..3405a79de 100644
--- a/meilidb-data/src/indexer.rs
+++ b/meilidb-data/src/indexer.rs
@@ -1,14 +1,78 @@
 use std::collections::BTreeMap;
 use std::convert::TryFrom;
+use std::sync::Arc;

 use deunicode::deunicode_with_tofu;
-use meilidb_core::{DocumentId, DocIndex};
-use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
+use meilidb_core::{DocumentId, DocIndex, Store};
 use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
-use sdset::Set;
+use sdset::{Set, SetBuf};
+use sled::Tree;
+use zerocopy::{AsBytes, LayoutVerified};

 use crate::SchemaAttr;

+#[derive(Clone)]
+pub struct WordIndexTree(pub Arc<Tree>);
+
+impl Store for WordIndexTree {
+    type Error = sled::Error;
+
+    fn get_fst(&self) -> Result<fst::Set, Self::Error> {
+        match self.0.get("fst")? {
+            Some(bytes) => {
+                let bytes: Arc<[u8]> = bytes.into();
+                let len = bytes.len();
+                let raw = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
+                Ok(fst::Set::from(raw))
+            },
+            None => Ok(fst::Set::default()),
+        }
+    }
+
+    fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error> {
+        let bytes = set.as_fst().to_vec();
+        self.0.set("fst", bytes)?;
+        Ok(())
+    }
+
+    fn get_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
+        let mut word_bytes = Vec::from("word-");
+        word_bytes.extend_from_slice(word);
+
+        match self.0.get(word_bytes)? {
+            Some(bytes) => {
+                let layout = LayoutVerified::new_slice(bytes.as_ref()).unwrap();
+                let slice = layout.into_slice();
+                let setbuf = SetBuf::new_unchecked(slice.to_vec());
+                Ok(Some(setbuf))
+            },
+            None => Ok(None),
+        }
+    }
+
+    fn set_indexes(&self, word: &[u8], indexes: &Set<DocIndex>) -> Result<(), Self::Error> {
+        let mut word_bytes = Vec::from("word-");
+        word_bytes.extend_from_slice(word);
+
+        let slice = indexes.as_slice();
+        let bytes = slice.as_bytes();
+
+        self.0.set(word_bytes, bytes)?;
+
+        Ok(())
+    }
+
+    fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error> {
+        let mut word_bytes = Vec::from("word-");
+        word_bytes.extend_from_slice(word);
+
+        self.0.del(word_bytes)?;
+
+        Ok(())
+    }
+
+}
+
 type Word = Vec<u8>; // TODO make it be a SmallVec

 pub struct Indexer {
@@ -48,18 +112,11 @@ impl Indexer {
         }
     }

-    pub fn build(self) -> WordIndex {
-        let mut builder = WordIndexBuilder::new();
-
-        for (key, mut indexes) in self.indexed {
+    pub fn build(self) -> BTreeMap<Word, SetBuf<DocIndex>> {
+        self.indexed.into_iter().map(|(word, mut indexes)| {
             indexes.sort_unstable();
-            indexes.dedup();
-
-            let indexes = Set::new_unchecked(&indexes);
-            builder.insert(key, indexes).unwrap();
-        }
-
-        builder.build()
+            (word, SetBuf::new_unchecked(indexes))
+        }).collect()
     }
 }
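[Annotation] Taken together, patch 01 changes the indexing flow: the Indexer now produces a plain `BTreeMap` of words to doc-indexes, and `Index::insert_indexes` merges that delta into whatever the `Store` already holds. A hedged wiring sketch, using only types introduced or kept by this patch; `Indexer::new` and the document-feeding calls are assumed from the surrounding code and are not shown in this diff.

    // Hypothetical end-to-end use of WordIndexTree + Index (sketch only).
    use meilidb_core::Index;

    fn merge_delta(db: &sled::Db) -> Result<(), sled::Error> {
        let tree = db.open_tree("example-word-index")?;
        let store = WordIndexTree(tree);

        // Load the current fst from the store (empty set when absent).
        let index = Index::from_store(store)?;

        let indexer = Indexer::new(); // assumed constructor
        // ... feed document fields into the indexer here ...
        let delta = indexer.build(); // BTreeMap<Word, SetBuf<DocIndex>>

        // insert_indexes unions the delta with the stored doc-indexes and
        // returns a fresh Index over the same store; callers then swap it in
        // (see RawIndex::update_word_index below).
        let _new_index = index.insert_indexes(delta)?;
        Ok(())
    }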
From 74acf83464d688f435a28a72e1fbd7dbfab57315 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Thu, 25 Apr 2019 11:44:38 +0200
Subject: [PATCH 02/32] chore: Remove the NewIndexEvent type

---
 meilidb-data/src/index_event.rs | 45 ---------------------------------
 meilidb-data/src/lib.rs         |  1 -
 2 files changed, 46 deletions(-)
 delete mode 100644 meilidb-data/src/index_event.rs

diff --git a/meilidb-data/src/index_event.rs b/meilidb-data/src/index_event.rs
deleted file mode 100644
index 3a63295c9..000000000
--- a/meilidb-data/src/index_event.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-use std::error::Error;
-
-use byteorder::{ReadBytesExt, WriteBytesExt};
-
-use meilidb_core::{Index as WordIndex};
-use meilidb_core::data::DocIds;
-use meilidb_core::write_to_bytes::WriteToBytes;
-use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
-
-enum NewIndexEvent<'a, S> {
-    RemovedDocuments(&'a DocIds),
-    UpdatedDocuments(&'a WordIndex<S>),
-}
-
-impl<'a, S> WriteToBytes for NewIndexEvent<'a, S> {
-    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        match self {
-            NewIndexEvent::RemovedDocuments(doc_ids) => {
-                let _ = bytes.write_u8(0);
-                doc_ids.write_to_bytes(bytes);
-            },
-            NewIndexEvent::UpdatedDocuments(index) => {
-                let _ = bytes.write_u8(1);
-                // index.write_to_bytes(bytes);
-            }
-        }
-    }
-}
-
-enum IndexEvent<S> {
-    RemovedDocuments(DocIds),
-    UpdatedDocuments(WordIndex<S>),
-}
-
-impl<S> FromSharedDataCursor for IndexEvent<S> {
-    type Error = Box<Error>;
-
-    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
-        match cursor.read_u8()? {
-            0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments),
-            // 1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
-            _ => Err("invalid index event type".into()),
-        }
-    }
-}
diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs
index 542741171..03f09728a 100644
--- a/meilidb-data/src/lib.rs
+++ b/meilidb-data/src/lib.rs
@@ -1,5 +1,4 @@
 mod database;
-mod index_event;
 mod indexer;
 mod number;
 mod ranked_map;

From 2e9fbd07cde5ae25a53489a6ee484035e35bc5e7 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Fri, 26 Apr 2019 14:50:54 +0200
Subject: [PATCH 03/32] chore: Remove most of the warnings

---
 meilidb-data/src/database.rs                  | 6 +-----
 meilidb-data/src/serde/convert_to_number.rs   | 2 +-
 meilidb-data/src/serde/convert_to_string.rs   | 2 +-
 meilidb-data/src/serde/extract_document_id.rs | 4 ++--
 meilidb-data/src/serde/indexer.rs             | 3 +--
 meilidb-data/src/serde/serializer.rs          | 2 +-
 6 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs
index f66d31aa9..23d998886 100644
--- a/meilidb-data/src/database.rs
+++ b/meilidb-data/src/database.rs
@@ -8,11 +8,7 @@ use std::{error, fmt};
 use arc_swap::{ArcSwap, Lease};
 use byteorder::{ReadBytesExt, BigEndian};
 use hashbrown::HashMap;
-use meilidb_core::criterion::Criteria;
-use meilidb_core::QueryBuilder;
-use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
-use meilidb_core::write_to_bytes::WriteToBytes;
-use meilidb_core::DocumentId;
+use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId};
 use rmp_serde::decode::{Error as RmpError};
 use sdset::SetBuf;
 use serde::de;
diff --git a/meilidb-data/src/serde/convert_to_number.rs b/meilidb-data/src/serde/convert_to_number.rs
index cf30e3b62..db888f64e 100644
--- a/meilidb-data/src/serde/convert_to_number.rs
+++ b/meilidb-data/src/serde/convert_to_number.rs
@@ -24,7 +24,7 @@ impl ser::Serializer for ConvertToNumber {
         Ok(Number::Unsigned(u64::from(value)))
     }

-    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
+    fn serialize_char(self, _value: char) -> Result<Self::Ok, Self::Error> {
         Err(SerializerError::UnrankableType { type_name: "char" })
     }
diff --git a/meilidb-data/src/serde/convert_to_string.rs b/meilidb-data/src/serde/convert_to_string.rs
index 67e592e78..cd109f534 100644
--- a/meilidb-data/src/serde/convert_to_string.rs
+++ b/meilidb-data/src/serde/convert_to_string.rs
@@ -16,7 +16,7 @@ impl ser::Serializer for ConvertToString {
     type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

-    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+    fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
         Err(SerializerError::UnserializableType { type_name: "boolean" })
     }
diff --git a/meilidb-data/src/serde/extract_document_id.rs b/meilidb-data/src/serde/extract_document_id.rs
index d7c6bb195..a67ee1441 100644
--- a/meilidb-data/src/serde/extract_document_id.rs
+++ b/meilidb-data/src/serde/extract_document_id.rs
@@ -56,11 +56,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
         f64 => serialize_f64,
     }

-    fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
+    fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
         Err(SerializerError::UnserializableType { type_name: "str" })
     }

-    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
+    fn serialize_bytes(self, _value: &[u8]) -> Result<Self::Ok, Self::Error> {
         Err(SerializerError::UnserializableType { type_name: "&[u8]" })
     }
diff --git a/meilidb-data/src/serde/indexer.rs b/meilidb-data/src/serde/indexer.rs
index 8eb0b2c67..f9445dca4 100644
--- a/meilidb-data/src/serde/indexer.rs
+++ b/meilidb-data/src/serde/indexer.rs
@@ -2,7 +2,6 @@ use meilidb_core::DocumentId;
 use serde::ser;
 use serde::Serialize;

-use crate::database::RawIndex;
 use crate::indexer::Indexer as RawIndexer;
 use crate::schema::SchemaAttr;
 use super::{SerializerError, ConvertToString};
@@ -24,7 +23,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
     type SerializeStruct = StructSerializer<'a>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

-    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+    fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
         Err(SerializerError::UnindexableType { type_name: "boolean" })
     }
diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs
index 37b3c7036..1857fe134 100644
--- a/meilidb-data/src/serde/serializer.rs
+++ b/meilidb-data/src/serde/serializer.rs
@@ -4,7 +4,7 @@ use serde::ser;
 use crate::database::RawIndex;
 use crate::ranked_map::RankedMap;
 use crate::indexer::Indexer as RawIndexer;
-use crate::schema::{Schema, SchemaAttr};
+use crate::schema::Schema;
 use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};

 pub struct Serializer<'a> {

From db031a5b95084b029a79b11a20a2bfe9c6b87cf1 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Fri, 26 Apr 2019 15:28:26 +0200
Subject: [PATCH 04/32] chore: Remove the DocIds type

---
 meilidb-core/src/data/doc_ids.rs | 61 --------------------------------
 meilidb-core/src/data/mod.rs     |  2 --
 2 files changed, 63 deletions(-)
 delete mode 100644 meilidb-core/src/data/doc_ids.rs

diff --git a/meilidb-core/src/data/doc_ids.rs b/meilidb-core/src/data/doc_ids.rs
deleted file mode 100644
index ff951bb35..000000000
--- a/meilidb-core/src/data/doc_ids.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use std::slice::from_raw_parts;
-use std::mem::size_of;
-use std::error::Error;
-
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use sdset::Set;
-
-use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
-use crate::write_to_bytes::WriteToBytes;
-use crate::data::SharedData;
-use crate::DocumentId;
-
-use super::into_u8_slice;
-
-#[derive(Default, Clone)]
-pub struct DocIds(SharedData);
-
-impl DocIds {
-    pub fn new(ids: &Set<DocumentId>) -> DocIds {
-        let bytes = unsafe { into_u8_slice(ids.as_slice()) };
-        let data = SharedData::from_bytes(bytes.to_vec());
-        DocIds(data)
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-
-    pub fn as_bytes(&self) -> &[u8] {
-        &self.0
-    }
-}
-
-impl AsRef<Set<DocumentId>> for DocIds {
-    fn as_ref(&self) -> &Set<DocumentId> {
-        let slice = &self.0;
-        let ptr = slice.as_ptr() as *const DocumentId;
-        let len = slice.len() / size_of::<DocumentId>();
-        let slice = unsafe { from_raw_parts(ptr, len) };
-        Set::new_unchecked(slice)
-    }
-}
-
-impl FromSharedDataCursor for DocIds {
-    type Error = Box<Error>;
-
-    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let data = cursor.extract(len);
-
-        Ok(DocIds(data))
-    }
-}
-
-impl WriteToBytes for DocIds {
-    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let len = self.0.len() as u64;
-        bytes.write_u64::<LittleEndian>(len).unwrap();
-        bytes.extend_from_slice(&self.0);
-    }
-}
diff --git a/meilidb-core/src/data/mod.rs b/meilidb-core/src/data/mod.rs
index 895f553a6..195a71cdc 100644
--- a/meilidb-core/src/data/mod.rs
+++ b/meilidb-core/src/data/mod.rs
@@ -1,11 +1,9 @@
-mod doc_ids;
 mod doc_indexes;
 mod shared_data;

 use std::slice::from_raw_parts;
 use std::mem::size_of;

-pub use self::doc_ids::DocIds;
 pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
 pub use self::shared_data::SharedData;

From 27ffcaabe97569ee426279400317c1114bd2c4ff Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Fri, 26 Apr 2019 15:31:00 +0200
Subject: [PATCH 05/32] chore: Remove the DocIndexes type

---
 meilidb-core/src/data/doc_indexes.rs | 231 ---------------------------
 meilidb-core/src/data/mod.rs         |  11 --
 2 files changed, 242 deletions(-)
 delete mode 100644 meilidb-core/src/data/doc_indexes.rs

diff --git a/meilidb-core/src/data/doc_indexes.rs b/meilidb-core/src/data/doc_indexes.rs
deleted file mode 100644
index 2bb946745..000000000
--- a/meilidb-core/src/data/doc_indexes.rs
+++ /dev/null
@@ -1,231 +0,0 @@
-use std::io::{self, Write};
-use std::slice::from_raw_parts;
-use std::mem::size_of;
-use std::ops::Index;
-
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use sdset::Set;
-
-use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
-use crate::write_to_bytes::WriteToBytes;
-use crate::data::SharedData;
-use crate::DocIndex;
-
-use super::into_u8_slice;
-
-#[derive(Debug)]
-#[repr(C)]
-struct Range {
-    start: u64,
-    end: u64,
-}
-
-#[derive(Clone, Default)]
-pub struct DocIndexes {
-    ranges: SharedData,
-    indexes: SharedData,
-}
-
-impl DocIndexes {
-    pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
-        self.ranges().get(index).map(|Range { start, end }| {
-            let start = *start as usize;
-            let end = *end as usize;
-            let slice = &self.indexes()[start..end];
-            Set::new_unchecked(slice)
-        })
-    }
-
-    fn ranges(&self) -> &[Range] {
-        let slice = &self.ranges;
-        let ptr = slice.as_ptr() as *const Range;
-        let len = slice.len() / size_of::<Range>();
-        unsafe { from_raw_parts(ptr, len) }
-    }
-
-    fn indexes(&self) -> &[DocIndex] {
-        let slice = &self.indexes;
-        let ptr = slice.as_ptr() as *const DocIndex;
-        let len = slice.len() / size_of::<DocIndex>();
-        unsafe { from_raw_parts(ptr, len) }
-    }
-}
-
-impl Index<usize> for DocIndexes {
-    type Output = [DocIndex];
-
-    fn index(&self, index: usize) -> &Self::Output {
-        match self.get(index) {
-            Some(indexes) => indexes,
-            None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
-        }
-    }
-}
-
-impl FromSharedDataCursor for DocIndexes {
-    type Error = io::Error;
-
-    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let ranges = cursor.extract(len);
-
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let indexes = cursor.extract(len);
-
-        Ok(DocIndexes { ranges, indexes })
-    }
-}
-
-impl WriteToBytes for DocIndexes {
-    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let ranges_len = self.ranges.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(ranges_len);
-        bytes.extend_from_slice(&self.ranges);
-
-        let indexes_len = self.indexes.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(indexes_len);
-        bytes.extend_from_slice(&self.indexes);
-    }
-}
-
-pub struct DocIndexesBuilder<W> {
-    ranges: Vec<Range>,
-    indexes: Vec<DocIndex>,
-    wtr: W,
-}
-
-impl DocIndexesBuilder<Vec<u8>> {
-    pub fn memory() -> Self {
-        DocIndexesBuilder {
-            ranges: Vec::new(),
-            indexes: Vec::new(),
-            wtr: Vec::new(),
-        }
-    }
-}
-
-impl<W: Write> DocIndexesBuilder<W> {
-    pub fn new(wtr: W) -> Self {
-        DocIndexesBuilder {
-            ranges: Vec::new(),
-            indexes: Vec::new(),
-            wtr: wtr,
-        }
-    }
-
-    pub fn insert(&mut self, indexes: &Set<DocIndex>) {
-        let len = indexes.len() as u64;
-        let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
-        let range = Range { start, end: start + len };
-        self.ranges.push(range);
-
-        self.indexes.extend_from_slice(indexes);
-    }
-
-    pub fn finish(self) -> io::Result<()> {
-        self.into_inner().map(drop)
-    }
-
-    pub fn into_inner(mut self) -> io::Result<W> {
-        let ranges = unsafe { into_u8_slice(&self.ranges) };
-        let len = ranges.len() as u64;
-        self.wtr.write_u64::<LittleEndian>(len)?;
-        self.wtr.write_all(ranges)?;
-
-        let indexes = unsafe { into_u8_slice(&self.indexes) };
-        let len = indexes.len() as u64;
-        self.wtr.write_u64::<LittleEndian>(len)?;
-        self.wtr.write_all(indexes)?;
-
-        Ok(self.wtr)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::error::Error;
-    use crate::DocumentId;
-    use super::*;
-
-    #[test]
-    fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
-        let a = DocIndex {
-            document_id: DocumentId(0),
-            attribute: 3,
-            word_index: 11,
-            char_index: 30,
-            char_length: 4,
-        };
-        let b = DocIndex {
-            document_id: DocumentId(1),
-            attribute: 4,
-            word_index: 21,
-            char_index: 35,
-            char_length: 6,
-        };
-        let c = DocIndex {
-            document_id: DocumentId(2),
-            attribute: 8,
-            word_index: 2,
-            char_index: 89,
-            char_length: 6,
-        };
-
-        let mut builder = DocIndexesBuilder::memory();
-
-        builder.insert(Set::new(&[a])?);
-        builder.insert(Set::new(&[a, b, c])?);
-        builder.insert(Set::new(&[a, c])?);
-
-        let bytes = builder.into_inner()?;
-        let docs = DocIndexes::from_bytes(bytes)?;
-
-        assert_eq!(docs.get(0), Some(Set::new(&[a])?));
-        assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
-        assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
-        assert_eq!(docs.get(3), None);
-
-        Ok(())
-    }
-
-    #[test]
-    fn serialize_deserialize() -> Result<(), Box<Error>> {
-        let a = DocIndex {
-            document_id: DocumentId(0),
-            attribute: 3,
-            word_index: 11,
-            char_index: 30,
-            char_length: 4,
-        };
-        let b = DocIndex {
-            document_id: DocumentId(1),
-            attribute: 4,
-            word_index: 21,
-            char_index: 35,
-            char_length: 6,
-        };
-        let c = DocIndex {
-            document_id: DocumentId(2),
-            attribute: 8,
-            word_index: 2,
-            char_index: 89,
-            char_length: 6,
-        };
-
-        let mut builder = DocIndexesBuilder::memory();
-
-        builder.insert(Set::new(&[a])?);
-        builder.insert(Set::new(&[a, b, c])?);
-        builder.insert(Set::new(&[a, c])?);
-
-        let builder_bytes = builder.into_inner()?;
-        let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
-
-        let mut bytes = Vec::new();
-        docs.write_to_bytes(&mut bytes);
-
-        assert_eq!(builder_bytes, bytes);
-
-        Ok(())
-    }
-}
diff --git a/meilidb-core/src/data/mod.rs b/meilidb-core/src/data/mod.rs
index 195a71cdc..cb708088d 100644
--- a/meilidb-core/src/data/mod.rs
+++ b/meilidb-core/src/data/mod.rs
@@ -1,14 +1,3 @@
-mod doc_indexes;
 mod shared_data;

-use std::slice::from_raw_parts;
-use std::mem::size_of;
-
-pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
 pub use self::shared_data::SharedData;
-
-unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
-    let ptr = slice.as_ptr() as *const u8;
-    let len = slice.len() * size_of::<T>();
-    from_raw_parts(ptr, len)
-}

From 737db5668b4aec61b18b79088c35b8f83383fc18 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Fri, 26 Apr 2019 15:32:38 +0200
Subject: [PATCH 06/32] chore: Remove the SharedData/Cursor types

---
 meilidb-core/src/data/mod.rs           |  3 --
 meilidb-core/src/data/shared_data.rs   | 58 --------------------------
 meilidb-core/src/lib.rs                |  2 -
 meilidb-core/src/shared_data_cursor.rs | 56 -------------------------
 4 files changed, 119 deletions(-)
 delete mode 100644 meilidb-core/src/data/mod.rs
 delete mode 100644 meilidb-core/src/data/shared_data.rs
 delete mode 100644 meilidb-core/src/shared_data_cursor.rs

diff --git a/meilidb-core/src/data/mod.rs b/meilidb-core/src/data/mod.rs
deleted file mode 100644
index cb708088d..000000000
--- a/meilidb-core/src/data/mod.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-mod shared_data;
-
-pub use self::shared_data::SharedData;
diff --git a/meilidb-core/src/data/shared_data.rs b/meilidb-core/src/data/shared_data.rs
deleted file mode 100644
index fd505c6d9..000000000
--- a/meilidb-core/src/data/shared_data.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-use std::sync::Arc;
-use std::ops::Deref;
-
-#[derive(Clone)]
-pub struct SharedData {
-    pub bytes: Arc<[u8]>,
-    pub offset: usize,
-    pub len: usize,
-}
-
-impl SharedData {
-    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
-        let len = vec.len();
-        let bytes = Arc::from(vec);
-        SharedData::new(bytes, 0, len)
-    }
-
-    pub fn new(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedData {
-        SharedData { bytes, offset, len }
-    }
-
-    pub fn as_slice(&self) -> &[u8] {
-        &self.bytes[self.offset..self.offset + self.len]
-    }
-
-    pub fn range(&self, offset: usize, len: usize) -> SharedData {
-        assert!(offset + len <= self.len);
-        SharedData {
-            bytes: self.bytes.clone(),
-            offset: self.offset + offset,
-            len: len,
-        }
-    }
-}
-
-impl Default for SharedData {
-    fn default() -> SharedData {
-        SharedData {
-            bytes: Arc::from(Vec::new()),
-            offset: 0,
-            len: 0,
-        }
-    }
-}
-
-impl Deref for SharedData {
-    type Target = [u8];
-
-    fn deref(&self) -> &Self::Target {
-        self.as_slice()
-    }
-}
-
-impl AsRef<[u8]> for SharedData {
-    fn as_ref(&self) -> &[u8] {
-        self.as_slice()
-    }
-}
diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 5bff103cd..6d527e381 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -1,11 +1,9 @@
 pub mod criterion;
-pub mod data;
 mod index;
 mod automaton;
 mod query_builder;
 mod distinct_map;

-pub mod shared_data_cursor;
 pub mod write_to_bytes;

 use std::sync::Arc;
diff --git a/meilidb-core/src/shared_data_cursor.rs b/meilidb-core/src/shared_data_cursor.rs
deleted file mode 100644
index 9eeac472f..000000000
--- a/meilidb-core/src/shared_data_cursor.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-use std::io::{self, Read, Cursor, BufRead};
-use std::sync::Arc;
-use crate::data::SharedData;
-
-pub struct SharedDataCursor(Cursor<SharedData>);
-
-impl SharedDataCursor {
-    pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
-        let len = bytes.len();
-        let bytes = Arc::from(bytes);
-
-        SharedDataCursor::from_shared_bytes(bytes, 0, len)
-    }
-
-    pub fn from_shared_bytes(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedDataCursor {
-        let data = SharedData::new(bytes, offset, len);
-        let cursor = Cursor::new(data);
-
-        SharedDataCursor(cursor)
-    }
-
-    pub fn extract(&mut self, amt: usize) -> SharedData {
-        let offset = self.0.position() as usize;
-        let extracted = self.0.get_ref().range(offset, amt);
-        self.0.consume(amt);
-
-        extracted
-    }
-}
-
-impl Read for SharedDataCursor {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        self.0.read(buf)
-    }
-}
-
-impl BufRead for SharedDataCursor {
-    fn fill_buf(&mut self) -> io::Result<&[u8]> {
-        self.0.fill_buf()
-    }
-
-    fn consume(&mut self, amt: usize) {
-        self.0.consume(amt)
-    }
-}
-
-pub trait FromSharedDataCursor: Sized {
-    type Error;
-
-    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;
-
-    fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
-        let mut cursor = SharedDataCursor::from_bytes(bytes);
-        Self::from_shared_data_cursor(&mut cursor)
-    }
-}

From f16e0333e47dee34e58e2f83443637344f47e56c Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Fri, 26 Apr 2019 15:33:14 +0200
Subject: [PATCH 07/32] chore: Remove the WriteToBytes trait

---
 meilidb-core/src/lib.rs            | 2 --
 meilidb-core/src/write_to_bytes.rs | 9 ---------
 2 files changed, 11 deletions(-)
 delete mode 100644 meilidb-core/src/write_to_bytes.rs

diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 6d527e381..783740b1b 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -4,8 +4,6 @@ mod automaton;
 mod query_builder;
 mod distinct_map;

-pub mod write_to_bytes;
-
 use std::sync::Arc;
diff --git a/meilidb-core/src/write_to_bytes.rs b/meilidb-core/src/write_to_bytes.rs
deleted file mode 100644
index 4837eb6a3..000000000
--- a/meilidb-core/src/write_to_bytes.rs
+++ /dev/null
@@ -1,9 +0,0 @@
-pub trait WriteToBytes {
-    fn write_to_bytes(&self, bytes: &mut Vec<u8>);
-
-    fn into_bytes(&self) -> Vec<u8> {
-        let mut bytes = Vec::new();
-        self.write_to_bytes(&mut bytes);
-        bytes
-    }
-}

From 6eb25687f8ad48d88dd429e96ca66d6f9712c2c2 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Thu, 2 May 2019 12:10:54 +0200
Subject: [PATCH 08/32] feat: Handle word doc-indexes sled tree errors

---
 meilidb-core/src/query_builder.rs  | 22 ++++++++++------------
 meilidb/examples/query-database.rs |  2 +-
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 40e172403..b5ff3a530 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -86,7 +86,7 @@ impl<'c, I, FI, S> QueryBuilder<'c, I, FI>
 where I: Deref<Target=Index<S>>,
       S: Store,
 {
-    fn query_all(&self, query: &str) -> Vec<RawDocument> {
+    fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
         let automatons = generate_automatons(query);
         let fst = self.index.set.as_fst();
@@ -107,10 +107,8 @@
             let distance = automaton.eval(input).to_u8();
             let is_exact = distance == 0 && input.len() == automaton.query_len();

-            // let doc_indexes = &self.index.indexes;
-            // let doc_indexes = &doc_indexes[iv.value as usize];
-
-            let doc_indexes = self.index.store.get_indexes(input).unwrap().unwrap();
+            let doc_indexes = self.index.store.get_indexes(input)?;
+            let doc_indexes = doc_indexes.expect("word doc-indexes not found");

             for di in doc_indexes.as_slice() {
                 if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {

         info!("{} total documents to classify", raw_documents.len());
         info!("{} total matches to classify", total_matches);

-        raw_documents
+        Ok(raw_documents)
     }
 }

 impl<'c, I, FI, S> QueryBuilder<'c, I, FI>
 where I: Deref<Target=Index<S>>,
       FI: Fn(DocumentId) -> bool,
       S: Store,
 {
-    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
+    pub fn query(self, query: &str, range: Range<usize>) -> Result<Vec<Document>, S::Error> {
         // We delegate the filter work to the distinct query builder,
         // specifying a distinct rule that has no effect.
         if self.filter.is_some() {
@@ -153,7 +151,7 @@
         }

         let start = Instant::now();
-        let mut documents = self.query_all(query);
+        let mut documents = self.query_all(query)?;
         info!("query_all took {:.2?}", start.elapsed());

         let mut groups = vec![documents.as_mut_slice()];
@@ -190,7 +188,7 @@
         let offset = cmp::min(documents.len(), range.start);
         let iter = documents.into_iter().skip(offset).take(range.len());

-        iter.map(|d| Document::from_raw(&d)).collect()
+        Ok(iter.map(|d| Document::from_raw(&d)).collect())
     }
 }
@@ -224,9 +222,9 @@
-    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
+    pub fn query(self, query: &str, range: Range<usize>) -> Result<Vec<Document>, S::Error> {
         let start = Instant::now();
-        let mut documents = self.inner.query_all(query);
+        let mut documents = self.inner.query_all(query)?;
         info!("query_all took {:.2?}", start.elapsed());

         let mut groups = vec![documents.as_mut_slice()];
             }
         }

-        out_documents
+        Ok(out_documents)
     }
 }
diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs
index 6b048cc5b..1be27dce1 100644
--- a/meilidb/examples/query-database.rs
+++ b/meilidb/examples/query-database.rs
@@ -161,7 +161,7 @@ fn main() -> Result<(), Box<Error>> {
         let start_total = Instant::now();

         let builder = index.query_builder();
-        let documents = builder.query(query, 0..opt.number_results);
+        let documents = builder.query(query, 0..opt.number_results)?;

         let mut retrieve_duration = Duration::default();
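[Annotation] After patch 08, store errors propagate through `query_all` and both `query` methods instead of being unwrapped deep inside the engine; only the one known-invariant case ("the fst said this word exists") keeps an `expect`. A hedged caller sketch mirroring the query-database example; the `meilidb::Index` name stands in for whatever handle exposes `query_builder()` and is an assumption of this example:

    use std::error::Error;

    // Sketch only: sled/store errors now surface at the call site.
    fn run_query(index: &meilidb::Index, query: &str) -> Result<(), Box<Error>> {
        let builder = index.query_builder();
        let documents = builder.query(query, 0..20)?; // errors bubble up here
        for document in documents {
            println!("{:?}", document.id);
        }
        Ok(())
    }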
From 0c180262402c27214ec1c974a22bc88f9d28b4ae Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Mon, 6 May 2019 14:13:09 +0200
Subject: [PATCH 09/32] feat: Introduce Tree wrappers for each index component

---
 meilidb-data/src/database.rs           | 126 +++++++++++++++++++++----
 meilidb-data/src/document_attr_key.rs  |  69 ++++++++++++++
 meilidb-data/src/lib.rs                |   1 +
 meilidb-data/src/schema.rs             |  14 +--
 meilidb-data/src/serde/deserializer.rs |   2 +-
 5 files changed, 186 insertions(+), 26 deletions(-)
 create mode 100644 meilidb-data/src/document_attr_key.rs

diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs
index 23d998886..667baf2e0 100644
--- a/meilidb-data/src/database.rs
+++ b/meilidb-data/src/database.rs
@@ -1,4 +1,5 @@
 use std::collections::HashSet;
+use std::convert::TryInto;
 use std::io::{self, Cursor, BufRead};
 use std::iter::FromIterator;
 use std::path::Path;
 use std::sync::Arc;
 use std::{error, fmt};

 use arc_swap::{ArcSwap, Lease};
 use byteorder::{ReadBytesExt, BigEndian};
 use hashbrown::HashMap;
-use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId};
+use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId, DocIndex};
 use rmp_serde::decode::{Error as RmpError};
 use sdset::SetBuf;
 use serde::de;
 use sled::IVec;
+use zerocopy::{AsBytes, LayoutVerified};

 use crate::{Schema, SchemaAttr, RankedMap};
 use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
 use crate::indexer::{Indexer, WordIndexTree};
+use crate::document_attr_key::DocumentAttrKey;

 pub type WordIndex = meilidb_core::Index<WordIndexTree>;

 #[derive(Debug)]
 pub enum Error {
     WordIndexMissing,
     MissingDocumentId,
     SledError(sled::Error),
+    FstError(fst::Error),
     BincodeError(bincode::Error),
     SerializerError(SerializerError),
 }

+impl From<fst::Error> for Error {
+    fn from(error: fst::Error) -> Error {
+        Error::FstError(error)
+    }
+}
+
 impl From<bincode::Error> for Error {
     fn from(error: bincode::Error) -> Error {
         Error::BincodeError(error)
     }
 }

         WordIndexMissing => write!(f, "this index does not have a word index"),
         MissingDocumentId => write!(f, "document id is missing"),
         SledError(e) => write!(f, "sled error; {}", e),
+        FstError(e) => write!(f, "fst error; {}", e),
         BincodeError(e) => write!(f, "bincode error; {}", e),
         SerializerError(e) => write!(f, "serializer error; {}", e),

+struct RawIndex2 {
+    main: MainIndex,
+    words: WordsIndex,
+    documents: DocumentsIndex,
+}
+
+struct MainIndex(Arc<sled::Tree>);
+
+impl MainIndex {
+    fn schema(&self) -> Result<Option<Schema>, Error> {
+        match self.0.get("schema")? {
+            Some(bytes) => {
+                let schema = Schema::read_from_bin(bytes.as_ref())?;
+                Ok(Some(schema))
+            },
+            None => Ok(None),
+        }
+    }
+
+    fn words_set(&self) -> Result<Option<fst::Set>, Error> {
+        match self.0.get("words")? {
+            Some(bytes) => {
+                let len = bytes.len();
+                let value = bytes.into();
+                let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
+                Ok(Some(fst::Set::from(fst)))
+            },
+            None => Ok(None),
+        }
+    }
+
+    fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
+        match self.0.get("ranked-map")? {
+            Some(bytes) => {
+                let ranked_map = bincode::deserialize(bytes.as_ref())?;
+                Ok(Some(ranked_map))
+            },
+            None => Ok(None),
+        }
+    }
+}
+
+struct WordsIndex(Arc<sled::Tree>);
+
+impl WordsIndex {
+    fn doc_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Error> {
+        match self.0.get(word)? {
+            Some(bytes) => {
+                let layout = LayoutVerified::new_slice(bytes.as_ref()).expect("invalid layout");
+                let slice = layout.into_slice();
+                let setbuf = SetBuf::new_unchecked(slice.to_vec());
+                Ok(Some(setbuf))
+            },
+            None => Ok(None),
+        }
+    }
+}
+
+struct DocumentsIndex(Arc<sled::Tree>);
+
+impl DocumentsIndex {
+    fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> Result<Option<IVec>, Error> {
+        let key = DocumentAttrKey::new(id, attr).to_be_bytes();
+        self.0.get(key).map_err(Into::into)
+    }
+
+    fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
+        let start = DocumentAttrKey::new(id, SchemaAttr::min());
+        let start = start.to_be_bytes();
+
+        let end = DocumentAttrKey::new(id, SchemaAttr::max());
+        let end = end.to_be_bytes();
+
+        DocumentFieldsIter(self.0.range(start..=end))
+    }
+}
+
+pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
+
+impl<'a> Iterator for DocumentFieldsIter<'a> {
+    type Item = Result<(SchemaAttr, IVec), Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.0.next() {
+            Some(Ok((key, value))) => {
+                let slice: &[u8] = key.as_ref();
+                let array = slice.try_into().unwrap();
+                let key = DocumentAttrKey::from_be_bytes(array);
+                Some(Ok((key.attribute, value)))
+            },
+            Some(Err(e)) => Some(Err(Error::SledError(e))),
+            None => None,
+        }
+    }
+}
+
-pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
-
-impl<'a> Iterator for DocumentFieldsIter<'a> {
-    type Item = Result<(DocumentId, SchemaAttr, IVec), Error>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        match self.0.next() {
-            Some(Ok((key, value))) => {
-                let (id, attr) = extract_document_key(key).unwrap();
-                Some(Ok((id, attr, value)))
-            },
-            Some(Err(e)) => Some(Err(Error::SledError(e))),
-            None => None,
-        }
-    }
-}
-
 #[derive(Clone)]
 pub struct Index(RawIndex);
diff --git a/meilidb-data/src/document_attr_key.rs b/meilidb-data/src/document_attr_key.rs
new file mode 100644
index 000000000..d8cb0bc8d
--- /dev/null
+++ b/meilidb-data/src/document_attr_key.rs
@@ -0,0 +1,69 @@
+use meilidb_core::DocumentId;
+use crate::schema::SchemaAttr;
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct DocumentAttrKey {
+    pub document_id: DocumentId,
+    pub attribute: SchemaAttr,
+}
+
+impl DocumentAttrKey {
+    pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey {
+        DocumentAttrKey { document_id, attribute }
+    }
+
+    pub fn to_be_bytes(self) -> [u8; 10] {
+        let mut output = [0u8; 10];
+
+        let document_id = self.document_id.0.to_be_bytes();
+        let attribute = self.attribute.0.to_be_bytes();
+
+        unsafe {
+            use std::{mem::size_of, ptr::copy_nonoverlapping};
+
+            let output = output.as_mut_ptr();
+            copy_nonoverlapping(document_id.as_ptr(), output, size_of::<u64>());
+
+            let output = output.add(size_of::<u64>());
+            copy_nonoverlapping(attribute.as_ptr(), output, size_of::<u16>());
+        }
+
+        output
+    }
+
+    pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey {
+        let document_id;
+        let attribute;
+
+        unsafe {
+            use std::ptr::read_unaligned;
+
+            let pointer = bytes.as_ptr() as *const _;
+            let document_id_bytes = read_unaligned(pointer);
+            document_id = u64::from_be_bytes(document_id_bytes);
+
+            let pointer = pointer.add(1) as *const _;
+            let attribute_bytes = read_unaligned(pointer);
+            attribute = u16::from_be_bytes(attribute_bytes);
+        }
+
+        DocumentAttrKey {
+            document_id: DocumentId(document_id),
+            attribute: SchemaAttr(attribute),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn to_from_be_bytes() {
+        let document_id = DocumentId(67578308);
+        let schema_attr = SchemaAttr(3456);
+        let x = DocumentAttrKey::new(document_id, schema_attr);
+
+        assert_eq!(x, DocumentAttrKey::from_be_bytes(x.to_be_bytes()));
+    }
+}
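[Annotation] Big-endian is not incidental here: the lexicographic byte order of the encoded key must agree with the numeric order of `(document_id, attribute)`, which is what makes the `start..=end` range scan in `DocumentsIndex::document_fields` return one document's attributes contiguously. A small illustration using only the methods defined above (the values are arbitrary):

    // Byte order mirrors key order, so sled's lexicographic ranges work.
    fn key_order_holds() {
        let a = DocumentAttrKey::new(DocumentId(1), SchemaAttr(2)).to_be_bytes();
        let b = DocumentAttrKey::new(DocumentId(1), SchemaAttr(10)).to_be_bytes();
        let c = DocumentAttrKey::new(DocumentId(2), SchemaAttr(0)).to_be_bytes();
        assert!(a < b && b < c); // would fail with a little-endian encoding
    }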
diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs
index 03f09728a..149a65fa8 100644
--- a/meilidb-data/src/lib.rs
+++ b/meilidb-data/src/lib.rs
@@ -1,4 +1,5 @@
 mod database;
+mod document_attr_key;
 mod indexer;
 mod number;
 mod ranked_map;
diff --git a/meilidb-data/src/schema.rs b/meilidb-data/src/schema.rs
index bff7806dc..cc39cf0e8 100644
--- a/meilidb-data/src/schema.rs
+++ b/meilidb-data/src/schema.rs
@@ -186,12 +186,16 @@ impl Schema {
 pub struct SchemaAttr(pub u16);

 impl SchemaAttr {
-    pub fn new(value: u16) -> SchemaAttr {
+    pub const fn new(value: u16) -> SchemaAttr {
         SchemaAttr(value)
     }

-    pub fn min() -> SchemaAttr {
-        SchemaAttr(0)
+    pub const fn min() -> SchemaAttr {
+        SchemaAttr(u16::min_value())
+    }
+
+    pub const fn max() -> SchemaAttr {
+        SchemaAttr(u16::max_value())
     }

     pub fn next(self) -> Option<SchemaAttr> {
         self.0.checked_add(1).map(SchemaAttr)
     }

     pub fn prev(self) -> Option<SchemaAttr> {
         self.0.checked_sub(1).map(SchemaAttr)
     }
-
-    pub fn max() -> SchemaAttr {
-        SchemaAttr(u16::MAX)
-    }
 }

 impl fmt::Display for SchemaAttr {
diff --git a/meilidb-data/src/serde/deserializer.rs b/meilidb-data/src/serde/deserializer.rs
index 12873713b..dbebd8abf 100644
--- a/meilidb-data/src/serde/deserializer.rs
+++ b/meilidb-data/src/serde/deserializer.rs
@@ -45,7 +45,7 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
                 },
             }
         });
-        let iter = document_attributes.filter_map(|(_, attr, value)| {
+        let iter = document_attributes.filter_map(|(attr, value)| {
             if self.fields.map_or(true, |f| f.contains(&attr)) {
                 let attribute_name = self.raw_index.schema().attribute_name(attr);
                 Some((attribute_name, Value::new(value)))

From 8434ecbb436134397c01cf433e3f0c950e9fd966 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Mon, 6 May 2019 14:25:11 +0200
Subject: [PATCH 10/32] feat: Introduce the RankedMap real type

---
 meilidb-data/src/database.rs         | 10 ++++++----
 meilidb-data/src/ranked_map.rs       | 24 +++++++++++++++++++++++-
 meilidb-data/src/serde/serializer.rs |  3 +--
 meilidb/src/sort_by_attr.rs          |  4 ++--
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs
index 667baf2e0..e2fb2db6a 100644
--- a/meilidb-data/src/database.rs
+++ b/meilidb-data/src/database.rs
@@ -225,7 +225,7 @@ impl MainIndex {
     fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
         match self.0.get("ranked-map")? {
             Some(bytes) => {
-                let ranked_map = bincode::deserialize(bytes.as_ref())?;
+                let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?;
                 Ok(Some(ranked_map))
             },
             None => Ok(None),
@@ -309,7 +309,7 @@ impl RawIndex {
         let ranked_map = {
             let map = match inner.get("ranked-map")? {
-                Some(bytes) => bincode::deserialize(bytes.as_ref())?,
+                Some(bytes) => RankedMap::read_from_bin(bytes.as_ref())?,
                 None => RankedMap::default(),
             };
@@ -355,8 +355,10 @@
     }

     pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) -> sled::Result<()> {
-        let data = bincode::serialize(ranked_map.as_ref()).unwrap();
-        self.inner.set("ranked-map", data).map(drop)?;
+        let mut bytes = Vec::new();
+        ranked_map.as_ref().write_to_bin(&mut bytes).unwrap();
+
+        self.inner.set("ranked-map", bytes).map(drop)?;
         self.ranked_map.store(ranked_map);

         Ok(())
diff --git a/meilidb-data/src/ranked_map.rs b/meilidb-data/src/ranked_map.rs
index 7b4ff3735..aea7c8b20 100644
--- a/meilidb-data/src/ranked_map.rs
+++ b/meilidb-data/src/ranked_map.rs
@@ -1,5 +1,27 @@
+use std::io::{Read, Write};
+
 use hashbrown::HashMap;
 use meilidb_core::DocumentId;
+
 use crate::{SchemaAttr, Number};

-pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
+pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
+
+impl RankedMap {
+    pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
+        self.0.insert((document, attribute), number);
+    }
+
+    pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
+        self.0.get(&(document, attribute)).cloned()
+    }
+
+    pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
+        bincode::deserialize_from(reader).map(RankedMap)
+    }
+
+    pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
+        bincode::serialize_into(writer, &self.0)
+    }
+}
diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs
index 1857fe134..6f4ad3608 100644
--- a/meilidb-data/src/serde/serializer.rs
+++ b/meilidb-data/src/serde/serializer.rs
@@ -285,9 +285,8 @@ where T: ser::Serialize,
     }

     if props.is_ranked() {
-        let key = (document_id, attr);
         let number = value.serialize(ConvertToNumber)?;
-        ranked_map.insert(key, number);
+        ranked_map.insert(document_id, attr, number);
     }
 }
diff --git a/meilidb/src/sort_by_attr.rs b/meilidb/src/sort_by_attr.rs
index f4c4bcc41..2cacaae13 100644
--- a/meilidb/src/sort_by_attr.rs
+++ b/meilidb/src/sort_by_attr.rs
@@ -87,8 +87,8 @@ impl<'a> SortByAttr<'a> {

 impl<'a> Criterion for SortByAttr<'a> {
     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
-        let lhs = self.ranked_map.get(&(lhs.id, self.attr));
-        let rhs = self.ranked_map.get(&(rhs.id, self.attr));
+        let lhs = self.ranked_map.get(lhs.id, self.attr);
+        let rhs = self.ranked_map.get(rhs.id, self.attr);

         match (lhs, rhs) {
             (Some(lhs), Some(rhs)) => {
self.attr)); + let lhs = self.ranked_map.get(lhs.id, self.attr); + let rhs = self.ranked_map.get(rhs.id, self.attr); match (lhs, rhs) { (Some(lhs), Some(rhs)) => { From f317a7a3221f2028cc1b1dea1f069f0f47c779a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 6 May 2019 16:25:49 +0200 Subject: [PATCH 11/32] feat: implement open/create_index on the Database type --- meilidb-data/src/database.rs | 306 +++++++++++------------------------ 1 file changed, 94 insertions(+), 212 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index e2fb2db6a..a1608aab6 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -3,7 +3,7 @@ use std::convert::TryInto; use std::io::{self, Cursor, BufRead}; use std::iter::FromIterator; use std::path::Path; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use std::{error, fmt}; use arc_swap::{ArcSwap, Lease}; @@ -77,126 +77,122 @@ impl fmt::Display for Error { impl error::Error for Error { } -fn index_name(name: &str) -> Vec { - format!("index-{}", name).into_bytes() -} - -fn word_index_name(name: &str) -> Vec { - format!("word-index-{}", name).into_bytes() -} - -fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec { - let DocumentId(document_id) = id; - let SchemaAttr(schema_attr) = attr; - - let mut bytes = Vec::new(); - bytes.extend_from_slice(b"document-"); - bytes.extend_from_slice(&document_id.to_be_bytes()[..]); - bytes.extend_from_slice(&schema_attr.to_be_bytes()[..]); - bytes -} - -trait CursorExt { - fn consume_if_eq(&mut self, needle: &[u8]) -> bool; -} - -impl> CursorExt for Cursor { - fn consume_if_eq(&mut self, needle: &[u8]) -> bool { - let position = self.position() as usize; - let slice = self.get_ref().as_ref(); - - if slice[position..].starts_with(needle) { - self.consume(needle.len()); - true - } else { - false - } - } -} - -fn extract_document_key(key: Vec) -> io::Result<(DocumentId, SchemaAttr)> { - let mut key = Cursor::new(key); - - if !key.consume_if_eq(b"document-") { - return Err(io::Error::from(io::ErrorKind::InvalidData)) - } - - let document_id = key.read_u64::().map(DocumentId)?; - let schema_attr = key.read_u16::().map(SchemaAttr)?; - - Ok((document_id, schema_attr)) -} - -#[derive(Clone)] pub struct Database { - opened: Arc>>, + cache: RwLock>>, inner: sled::Db, } impl Database { pub fn start_default>(path: P) -> Result { + let cache = RwLock::new(HashMap::new()); let inner = sled::Db::start_default(path)?; - let opened = Arc::new(ArcSwap::new(Arc::new(HashMap::new()))); - Ok(Database { opened, inner }) + Ok(Database { cache, inner }) } - pub fn open_index(&self, name: &str) -> Result, Error> { - // check if the index was already opened - if let Some(raw_index) = self.opened.lease().get(name) { - return Ok(Some(Index(raw_index.clone()))) + pub fn open_index(&self, name: &str) -> Result>, Error> { + { + let cache = self.cache.read().unwrap(); + if let Some(index) = cache.get(name).cloned() { + return Ok(Some(index)) + } } - let raw_name = index_name(name); - if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) { - let tree = self.inner.open_tree(raw_name)?; - let word_index_tree = self.inner.open_tree(word_index_name(name))?; - let raw_index = RawIndex::from_raw(tree, word_index_tree)?; + let indexes: HashSet<&str> = match self.inner.get("indexes")? 
{ + Some(bytes) => bincode::deserialize(&bytes)?, + None => return Ok(None), + }; - self.opened.rcu(|opened| { - let mut opened = HashMap::clone(opened); - opened.insert(name.to_string(), raw_index.clone()); - opened - }); - - return Ok(Some(Index(raw_index))) + if indexes.get(name).is_none() { + return Ok(None); } - Ok(None) + let main = { + let tree = self.inner.open_tree(name)?; + MainIndex(tree) + }; + + let words = { + let tree_name = format!("{}-words", name); + let tree = self.inner.open_tree(tree_name)?; + WordsIndex(tree) + }; + + let documents = { + let tree_name = format!("{}-documents", name); + let tree = self.inner.open_tree(tree_name)?; + DocumentsIndex(tree) + }; + + let raw_index = RawIndex { main, words, documents }; + let index = Arc::new(Index(raw_index)); + + { + let cache = self.cache.write().unwrap(); + cache.insert(name.to_string(), index.clone()); + } + + Ok(Some(index)) } - pub fn create_index(&self, name: String, schema: Schema) -> Result { - match self.open_index(&name)? { - Some(index) => { - if index.schema() != &schema { - return Err(Error::SchemaDiffer); - } - - Ok(index) - }, - None => { - let raw_name = index_name(&name); - let tree = self.inner.open_tree(raw_name)?; - let word_index_tree = self.inner.open_tree(word_index_name(&name))?; - let raw_index = RawIndex::new_from_raw(tree, word_index_tree, schema)?; - - self.opened.rcu(|opened| { - let mut opened = HashMap::clone(opened); - opened.insert(name.clone(), raw_index.clone()); - opened - }); - - Ok(Index(raw_index)) - }, + pub fn create_index(&self, name: &str, schema: Schema) -> Result, Error> { + { + let cache = self.cache.read().unwrap(); + if let Some(index) = cache.get(name).cloned() { + // TODO check if schemas are the same + return Ok(index) + } } + + let mut indexes: HashSet<&str> = match self.inner.get("indexes")? { + Some(bytes) => bincode::deserialize(&bytes)?, + None => HashSet::new(), + }; + + let new_insertion = indexes.insert(name); + + let main = { + let tree = self.inner.open_tree(name)?; + MainIndex(tree) + }; + + if let Some(prev_schema) = main.schema()? { + if prev_schema != schema { + return Err(Error::SchemaDiffer) + } + } + + let words = { + let tree_name = format!("{}-words", name); + let tree = self.inner.open_tree(tree_name)?; + WordsIndex(tree) + }; + + let documents = { + let tree_name = format!("{}-documents", name); + let tree = self.inner.open_tree(tree_name)?; + DocumentsIndex(tree) + }; + + let raw_index = RawIndex { main, words, documents }; + let index = Arc::new(Index(raw_index)); + + { + let cache = self.cache.write().unwrap(); + cache.insert(name.to_string(), index.clone()); + } + + Ok(index) } } -struct RawIndex2 { +#[derive(Clone)] +struct RawIndex { main: MainIndex, words: WordsIndex, documents: DocumentsIndex, } +#[derive(Clone)] struct MainIndex(Arc); impl MainIndex { @@ -233,6 +229,7 @@ impl MainIndex { } } +#[derive(Clone)] struct WordsIndex(Arc); impl WordsIndex { @@ -249,6 +246,7 @@ impl WordsIndex { } } +#[derive(Clone)] struct DocumentsIndex(Arc); impl DocumentsIndex { @@ -287,122 +285,6 @@ impl<'a> Iterator for DocumentFieldsIter<'a> { } } -#[derive(Clone)] -pub struct RawIndex { - schema: Schema, - word_index: Arc>, - ranked_map: Arc>, - inner: Arc, -} - -impl RawIndex { - fn from_raw(inner: Arc, word_index: Arc) -> Result { - let schema = { - let bytes = inner.get("schema")?; - let bytes = bytes.ok_or(Error::SchemaMissing)?; - Schema::read_from_bin(bytes.as_ref())? 
- }; - - let store = WordIndexTree(word_index); - let word_index = WordIndex::from_store(store)?; - let word_index = Arc::new(ArcSwap::new(Arc::new(word_index))); - - let ranked_map = { - let map = match inner.get("ranked-map")? { - Some(bytes) => RankedMap::read_from_bin(bytes.as_ref())?, - None => RankedMap::default(), - }; - - Arc::new(ArcSwap::new(Arc::new(map))) - }; - - Ok(RawIndex { schema, word_index, ranked_map, inner }) - } - - fn new_from_raw( - inner: Arc, - word_index: Arc, - schema: Schema, - ) -> Result - { - let mut schema_bytes = Vec::new(); - schema.write_to_bin(&mut schema_bytes)?; - inner.set("schema", schema_bytes)?; - - let store = WordIndexTree(word_index); - let word_index = WordIndex::from_store(store)?; - let word_index = Arc::new(ArcSwap::new(Arc::new(word_index))); - - let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default()))); - - Ok(RawIndex { schema, word_index, ranked_map, inner }) - } - - pub fn schema(&self) -> &Schema { - &self.schema - } - - pub fn word_index(&self) -> Lease> { - self.word_index.lease() - } - - pub fn ranked_map(&self) -> Lease> { - self.ranked_map.lease() - } - - pub fn update_word_index(&self, word_index: Arc) { - self.word_index.store(word_index) - } - - pub fn update_ranked_map(&self, ranked_map: Arc) -> sled::Result<()> { - let mut bytes = Vec::new(); - ranked_map.as_ref().write_to_bin(&mut bytes).unwrap(); - - self.inner.set("ranked-map", bytes).map(drop)?; - self.ranked_map.store(ranked_map); - - Ok(()) - } - - pub fn set_document_attribute( - &self, - id: DocumentId, - attr: SchemaAttr, - value: V, - ) -> Result, sled::Error> - where IVec: From, - { - let key = document_key(id, attr); - Ok(self.inner.set(key, value)?) - } - - pub fn get_document_attribute( - &self, - id: DocumentId, - attr: SchemaAttr - ) -> Result, sled::Error> - { - let key = document_key(id, attr); - Ok(self.inner.get(key)?) - } - - pub fn get_document_fields(&self, id: DocumentId) -> DocumentFieldsIter { - let start = document_key(id, SchemaAttr::min()); - let end = document_key(id, SchemaAttr::max()); - DocumentFieldsIter(self.inner.range(start..=end)) - } - - pub fn del_document_attribute( - &self, - id: DocumentId, - attr: SchemaAttr - ) -> Result, sled::Error> - { - let key = document_key(id, attr); - Ok(self.inner.del(key)?) 
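// Why `document_fields` can use a plain `start..=end` range scan: big-endian
// encoding makes byte-wise key order agree with `(document_id, attribute)`
// order, so every attribute of a document sits in one contiguous key range.
// A small illustrative check:
let a = DocumentAttrKey::new(DocumentId(1), SchemaAttr::new(300)).to_be_bytes();
let b = DocumentAttrKey::new(DocumentId(2), SchemaAttr::new(0)).to_be_bytes();
assert!(a < b); // holds for big-endian keys; a little-endian layout would break it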
- } -} - #[derive(Clone)] pub struct Index(RawIndex); From 42e39f6eb5948b8eaf33cb4a85120a23b958b64b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 May 2019 12:11:22 +0200 Subject: [PATCH 12/32] feat: Introduce a simplified version of the Store trait --- meilidb-core/src/index.rs | 134 ------------------------------ meilidb-core/src/lib.rs | 12 +-- meilidb-core/src/query_builder.rs | 48 +++++------ meilidb-core/src/store.rs | 23 +++++ 4 files changed, 52 insertions(+), 165 deletions(-) delete mode 100644 meilidb-core/src/index.rs create mode 100644 meilidb-core/src/store.rs diff --git a/meilidb-core/src/index.rs b/meilidb-core/src/index.rs deleted file mode 100644 index 87b59e682..000000000 --- a/meilidb-core/src/index.rs +++ /dev/null @@ -1,134 +0,0 @@ -use std::collections::BTreeMap; -use fst::{set, IntoStreamer, Streamer}; -use sdset::{Set, SetBuf, SetOperation}; -use sdset::duo::{Union, DifferenceByKey}; -use crate::{DocIndex, DocumentId}; - -pub type Word = Vec; // TODO should be a smallvec - -pub trait Store: Clone { - type Error: std::error::Error; - - fn get_fst(&self) -> Result; - fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error>; - - fn get_indexes(&self, word: &[u8]) -> Result>, Self::Error>; - fn set_indexes(&self, word: &[u8], indexes: &Set) -> Result<(), Self::Error>; - fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error>; -} - -pub struct Index { - pub set: fst::Set, - pub store: S, -} - -impl Index -where S: Store, -{ - pub fn from_store(store: S) -> Result, S::Error> { - let set = store.get_fst()?; - Ok(Index { set, store }) - } - - pub fn remove_documents(&self, documents: &Set) -> Result, S::Error> { - let mut buffer = Vec::new(); - let mut builder = fst::SetBuilder::memory(); - let mut stream = self.into_stream(); - - while let Some((input, result)) = stream.next() { - let indexes = match result? 
{ - Some(indexes) => indexes, - None => continue, - }; - - let op = DifferenceByKey::new(&indexes, documents, |x| x.document_id, |x| *x); - buffer.clear(); - op.extend_vec(&mut buffer); - - if buffer.is_empty() { - self.store.del_indexes(input)?; - } else { - builder.insert(input).unwrap(); - let indexes = Set::new_unchecked(&buffer); - self.store.set_indexes(input, indexes)?; - } - } - - let set = builder.into_inner().and_then(fst::Set::from_bytes).unwrap(); - self.store.set_fst(&set)?; - - Ok(Index { set, store: self.store.clone() }) - } - - pub fn insert_indexes(&self, map: BTreeMap>) -> Result, S::Error> { - let mut buffer = Vec::new(); - let mut builder = fst::SetBuilder::memory(); - let set = fst::Set::from_iter(map.keys()).unwrap(); - let mut union_ = self.set.op().add(&set).r#union(); - - while let Some(input) = union_.next() { - let remote = self.store.get_indexes(input)?; - let locale = map.get(input); - - match (remote, locale) { - (Some(remote), Some(locale)) => { - buffer.clear(); - Union::new(&remote, &locale).extend_vec(&mut buffer); - let indexes = Set::new_unchecked(&buffer); - - if !indexes.is_empty() { - self.store.set_indexes(input, indexes)?; - builder.insert(input).unwrap(); - } else { - self.store.del_indexes(input)?; - } - }, - (None, Some(locale)) => { - self.store.set_indexes(input, &locale)?; - builder.insert(input).unwrap(); - }, - (Some(_), None) => { - builder.insert(input).unwrap(); - }, - (None, None) => unreachable!(), - } - } - - let set = builder.into_inner().and_then(fst::Set::from_bytes).unwrap(); - self.store.set_fst(&set)?; - - Ok(Index { set, store: self.store.clone() }) - } -} - -pub struct Stream<'m, S> { - set_stream: set::Stream<'m>, - store: &'m S, -} - -impl<'m, 'a, S> Streamer<'a> for Stream<'m, S> -where S: 'a + Store, -{ - type Item = (&'a [u8], Result>, S::Error>); - - fn next(&'a mut self) -> Option { - match self.set_stream.next() { - Some(input) => Some((input, self.store.get_indexes(input))), - None => None, - } - } -} - -impl<'m, 'a, S> IntoStreamer<'a> for &'m Index -where S: 'a + Store, -{ - type Item = (&'a [u8], Result>, S::Error>); - type Into = Stream<'m, S>; - - fn into_stream(self) -> Self::Into { - Stream { - set_stream: self.set.into_stream(), - store: &self.store, - } - } -} diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 783740b1b..838d787dd 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -1,18 +1,18 @@ -pub mod criterion; -mod index; mod automaton; -mod query_builder; mod distinct_map; +mod query_builder; +mod store; +pub mod criterion; use std::sync::Arc; -use serde::{Serialize, Deserialize}; -use slice_group_by::GroupBy; use rayon::slice::ParallelSliceMut; +use serde::{Serialize, Deserialize}; +use slice_group_by::GroupBy; use zerocopy::{AsBytes, FromBytes}; -pub use self::index::{Index, Store}; pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder}; +pub use self::store::Store; /// Represent an internally generated document unique identifier. 
/// diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index b5ff3a530..25cd10b2a 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -1,5 +1,5 @@ use std::hash::Hash; -use std::ops::{Range, Deref}; +use std::ops::Range; use std::rc::Rc; use std::time::Instant; use std::{cmp, mem}; @@ -15,7 +15,7 @@ use crate::automaton::{self, DfaExt, AutomatonExt}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::criterion::Criteria; use crate::raw_documents_from_matches; -use crate::{Match, DocumentId, Index, Store, RawDocument, Document}; +use crate::{Match, DocumentId, Store, RawDocument, Document}; fn generate_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); @@ -35,37 +35,37 @@ fn generate_automatons(query: &str) -> Vec { automatons } -pub struct QueryBuilder<'c, I, FI = fn(DocumentId) -> bool> { - index: I, +pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> { + store: S, criteria: Criteria<'c>, searchable_attrs: Option>, filter: Option, } -impl<'c, I> QueryBuilder<'c, I, fn(DocumentId) -> bool> { - pub fn new(index: I) -> Self { - QueryBuilder::with_criteria(index, Criteria::default()) +impl<'c, S> QueryBuilder<'c, S, fn(DocumentId) -> bool> { + pub fn new(store: S) -> Self { + QueryBuilder::with_criteria(store, Criteria::default()) } - pub fn with_criteria(index: I, criteria: Criteria<'c>) -> Self { - QueryBuilder { index, criteria, searchable_attrs: None, filter: None } + pub fn with_criteria(store: S, criteria: Criteria<'c>) -> Self { + QueryBuilder { store, criteria, searchable_attrs: None, filter: None } } } -impl<'c, I, FI> QueryBuilder<'c, I, FI> +impl<'c, S, FI> QueryBuilder<'c, S, FI> { - pub fn with_filter(self, function: F) -> QueryBuilder<'c, I, F> + pub fn with_filter(self, function: F) -> QueryBuilder<'c, S, F> where F: Fn(DocumentId) -> bool, { QueryBuilder { - index: self.index, + store: self.store, criteria: self.criteria, searchable_attrs: self.searchable_attrs, filter: Some(function) } } - pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder<'c, I, FI, F> + pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder<'c, S, FI, F> where F: Fn(DocumentId) -> Option, K: Hash + Eq, { @@ -82,18 +82,18 @@ impl<'c, I, FI> QueryBuilder<'c, I, FI> } } -impl<'c, I, FI, S> QueryBuilder<'c, I, FI> -where I: Deref>, - S: Store, +impl<'c, S, FI> QueryBuilder<'c, S, FI> +where S: Store, { fn query_all(&self, query: &str) -> Result, S::Error> { let automatons = generate_automatons(query); - let fst = self.index.set.as_fst(); + let words = self.store.words()?; + let words = words.as_fst(); let mut stream = { let mut op_builder = fst::raw::OpBuilder::new(); for automaton in &automatons { - let stream = fst.search(automaton); + let stream = words.search(automaton); op_builder.push(stream); } op_builder.r#union() @@ -107,7 +107,7 @@ where I: Deref>, let distance = automaton.eval(input).to_u8(); let is_exact = distance == 0 && input.len() == automaton.query_len(); - let doc_indexes = self.index.store.get_indexes(input)?; + let doc_indexes = self.store.word_indexes(input)?; let doc_indexes = doc_indexes.expect("word doc-indexes not found"); for di in doc_indexes.as_slice() { @@ -137,10 +137,9 @@ where I: Deref>, } } -impl<'c, I, FI, S> QueryBuilder<'c, I, FI> -where I: Deref>, +impl<'c, S, FI> QueryBuilder<'c, S, FI> +where S: Store, FI: Fn(DocumentId) -> bool, - S: Store, { pub fn query(self, 
query: &str, range: Range) -> Result, S::Error> { // We delegate the filter work to the distinct query builder, @@ -215,12 +214,11 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD> } } -impl<'c, I, FI, FD, K, S> DistinctQueryBuilder<'c, I, FI, FD> -where I: Deref>, +impl<'c, S, FI, FD, K> DistinctQueryBuilder<'c, S, FI, FD> +where S: Store, FI: Fn(DocumentId) -> bool, FD: Fn(DocumentId) -> Option, K: Hash + Eq, - S: Store, { pub fn query(self, query: &str, range: Range) -> Result, S::Error> { let start = Instant::now(); diff --git a/meilidb-core/src/store.rs b/meilidb-core/src/store.rs new file mode 100644 index 000000000..14e95f0cc --- /dev/null +++ b/meilidb-core/src/store.rs @@ -0,0 +1,23 @@ +use std::error::Error; +use fst::Set; +use sdset::SetBuf; +use crate::DocIndex; + +pub trait Store { + type Error: Error; + + fn words(&self) -> Result<&Set, Self::Error>; + fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error>; +} + +impl Store for &'_ T where T: Store { + type Error = T::Error; + + fn words(&self) -> Result<&Set, Self::Error> { + (*self).words() + } + + fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { + (*self).word_indexes(word) + } +} From e67ada8823c94d7c0f360ca11b5c31c7d59234c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 May 2019 14:23:39 +0200 Subject: [PATCH 13/32] feat: Introduce the DocumentsAddition type --- meilidb-core/src/query_builder.rs | 3 +- meilidb-data/src/database.rs | 420 ++++++++++++++++--------- meilidb-data/src/indexer.rs | 68 +--- meilidb-data/src/serde/deserializer.rs | 18 +- meilidb-data/src/serde/serializer.rs | 12 +- 5 files changed, 289 insertions(+), 232 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 25cd10b2a..73d0601a9 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -87,8 +87,7 @@ where S: Store, { fn query_all(&self, query: &str) -> Result, S::Error> { let automatons = generate_automatons(query); - let words = self.store.words()?; - let words = words.as_fst(); + let words = self.store.words()?.as_fst(); let mut stream = { let mut op_builder = fst::raw::OpBuilder::new(); diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index a1608aab6..20773e00d 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,4 +1,5 @@ -use std::collections::HashSet; +use std::collections::{HashSet, HashMap}; +use std::collections::hash_map::Entry; use std::convert::TryInto; use std::io::{self, Cursor, BufRead}; use std::iter::FromIterator; @@ -8,21 +9,19 @@ use std::{error, fmt}; use arc_swap::{ArcSwap, Lease}; use byteorder::{ReadBytesExt, BigEndian}; -use hashbrown::HashMap; -use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId, DocIndex}; +use meilidb_core::{criterion::Criteria, QueryBuilder, Store, DocumentId, DocIndex}; use rmp_serde::decode::{Error as RmpError}; -use sdset::SetBuf; +use sdset::{Set, SetBuf, SetOperation, duo::Union}; use serde::de; use sled::IVec; use zerocopy::{AsBytes, LayoutVerified}; +use fst::{SetBuilder, set::OpBuilder}; use crate::{Schema, SchemaAttr, RankedMap}; use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError}; -use crate::indexer::{Indexer, WordIndexTree}; +use crate::indexer::Indexer; use crate::document_attr_key::DocumentAttrKey; -pub type WordIndex = meilidb_core::Index; - #[derive(Debug)] pub enum Error { SchemaDiffer, @@ -97,106 +96,113 @@ impl Database { } } - let indexes: 
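// A minimal in-memory implementation of the `Store` trait above, handy for
// exercising `QueryBuilder` in tests; `MemStore` is illustrative and not part
// of the patch.
use std::collections::HashMap;

struct MemStore {
    words: fst::Set,
    indexes: HashMap<Vec<u8>, sdset::SetBuf<DocIndex>>,
}

impl Store for MemStore {
    // any error type satisfies the bound; io::Error is an arbitrary choice
    type Error = std::io::Error;

    fn words(&self) -> Result<&fst::Set, Self::Error> {
        Ok(&self.words)
    }

    fn word_indexes(&self, word: &[u8]) -> Result<Option<sdset::SetBuf<DocIndex>>, Self::Error> {
        Ok(self.indexes.get(word).cloned())
    }
}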
HashSet<&str> = match self.inner.get("indexes")? { - Some(bytes) => bincode::deserialize(&bytes)?, - None => return Ok(None), + let mut cache = self.cache.write().unwrap(); + let index = match cache.entry(name.to_string()) { + Entry::Occupied(occupied) => { + occupied.get().clone() + }, + Entry::Vacant(vacant) => { + let bytes = match self.inner.get("indexes")? { + Some(bytes) => bytes, + None => return Ok(None), + }; + + let indexes: HashSet<&str> = bincode::deserialize(&bytes)?; + if indexes.get(name).is_none() { + return Ok(None); + } + + let main = { + let tree = self.inner.open_tree(name)?; + MainIndex(tree) + }; + + let words = { + let tree_name = format!("{}-words", name); + let tree = self.inner.open_tree(tree_name)?; + WordsIndex(tree) + }; + + let documents = { + let tree_name = format!("{}-documents", name); + let tree = self.inner.open_tree(tree_name)?; + DocumentsIndex(tree) + }; + + let raw_index = RawIndex { main, words, documents }; + let index = Index::from_raw(raw_index)?; + + vacant.insert(Arc::new(index)).clone() + }, }; - if indexes.get(name).is_none() { - return Ok(None); - } - - let main = { - let tree = self.inner.open_tree(name)?; - MainIndex(tree) - }; - - let words = { - let tree_name = format!("{}-words", name); - let tree = self.inner.open_tree(tree_name)?; - WordsIndex(tree) - }; - - let documents = { - let tree_name = format!("{}-documents", name); - let tree = self.inner.open_tree(tree_name)?; - DocumentsIndex(tree) - }; - - let raw_index = RawIndex { main, words, documents }; - let index = Arc::new(Index(raw_index)); - - { - let cache = self.cache.write().unwrap(); - cache.insert(name.to_string(), index.clone()); - } - Ok(Some(index)) } pub fn create_index(&self, name: &str, schema: Schema) -> Result, Error> { - { - let cache = self.cache.read().unwrap(); - if let Some(index) = cache.get(name).cloned() { - // TODO check if schemas are the same - return Ok(index) - } - } + let mut cache = self.cache.write().unwrap(); - let mut indexes: HashSet<&str> = match self.inner.get("indexes")? { - Some(bytes) => bincode::deserialize(&bytes)?, - None => HashSet::new(), + let index = match cache.entry(name.to_string()) { + Entry::Occupied(occupied) => { + occupied.get().clone() + }, + Entry::Vacant(vacant) => { + let bytes = self.inner.get("indexes")?; + let bytes = bytes.as_ref(); + + let mut indexes: HashSet<&str> = match bytes { + Some(bytes) => bincode::deserialize(bytes)?, + None => HashSet::new(), + }; + + let new_insertion = indexes.insert(name); + + let main = { + let tree = self.inner.open_tree(name)?; + MainIndex(tree) + }; + + if let Some(prev_schema) = main.schema()? { + if prev_schema != schema { + return Err(Error::SchemaDiffer) + } + } + + let words = { + let tree_name = format!("{}-words", name); + let tree = self.inner.open_tree(tree_name)?; + WordsIndex(tree) + }; + + let documents = { + let tree_name = format!("{}-documents", name); + let tree = self.inner.open_tree(tree_name)?; + DocumentsIndex(tree) + }; + + let raw_index = RawIndex { main, words, documents }; + let index = Index::from_raw(raw_index)?; + + vacant.insert(Arc::new(index)).clone() + }, }; - let new_insertion = indexes.insert(name); - - let main = { - let tree = self.inner.open_tree(name)?; - MainIndex(tree) - }; - - if let Some(prev_schema) = main.schema()? 
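// The locking pattern `open_index` and `create_index` rely on, reduced to its
// core: a read-lock fast path, then the write lock plus the `Entry` API so two
// racing callers never build the same index twice. Sketch with a generic value
// type; the closure stands in for the tree-opening code.
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

fn get_or_insert_with<V>(
    cache: &RwLock<HashMap<String, Arc<V>>>,
    name: &str,
    build: impl FnOnce() -> V,
) -> Arc<V> {
    if let Some(value) = cache.read().unwrap().get(name).cloned() {
        return value; // fast path under the shared lock
    }

    match cache.write().unwrap().entry(name.to_string()) {
        Entry::Occupied(occupied) => occupied.get().clone(),
        Entry::Vacant(vacant) => vacant.insert(Arc::new(build())).clone(),
    }
}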
{ - if prev_schema != schema { - return Err(Error::SchemaDiffer) - } - } - - let words = { - let tree_name = format!("{}-words", name); - let tree = self.inner.open_tree(tree_name)?; - WordsIndex(tree) - }; - - let documents = { - let tree_name = format!("{}-documents", name); - let tree = self.inner.open_tree(tree_name)?; - DocumentsIndex(tree) - }; - - let raw_index = RawIndex { main, words, documents }; - let index = Arc::new(Index(raw_index)); - - { - let cache = self.cache.write().unwrap(); - cache.insert(name.to_string(), index.clone()); - } - Ok(index) } } #[derive(Clone)] -struct RawIndex { - main: MainIndex, - words: WordsIndex, - documents: DocumentsIndex, +pub struct RawIndex { + pub main: MainIndex, + pub words: WordsIndex, + pub documents: DocumentsIndex, } #[derive(Clone)] -struct MainIndex(Arc); +pub struct MainIndex(Arc); impl MainIndex { - fn schema(&self) -> Result, Error> { + pub fn schema(&self) -> Result, Error> { match self.0.get("schema")? { Some(bytes) => { let schema = Schema::read_from_bin(bytes.as_ref())?; @@ -206,7 +212,7 @@ impl MainIndex { } } - fn words_set(&self) -> Result, Error> { + pub fn words_set(&self) -> Result, Error> { match self.0.get("words")? { Some(bytes) => { let len = bytes.len(); @@ -218,7 +224,12 @@ impl MainIndex { } } - fn ranked_map(&self) -> Result, Error> { + pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> { + self.0.set("words", value.as_fst().as_bytes())?; + Ok(()) + } + + pub fn ranked_map(&self) -> Result, Error> { match self.0.get("ranked-map")? { Some(bytes) => { let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?; @@ -227,13 +238,20 @@ impl MainIndex { None => Ok(None), } } + + pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> { + let mut bytes = Vec::new(); + value.write_to_bin(&mut bytes)?; + self.0.set("ranked_map", bytes)?; + Ok(()) + } } #[derive(Clone)] -struct WordsIndex(Arc); +pub struct WordsIndex(Arc); impl WordsIndex { - fn doc_indexes(&self, word: &[u8]) -> Result>, Error> { + pub fn doc_indexes(&self, word: &[u8]) -> sled::Result>> { match self.0.get(word)? 
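// Note that `ranked_map()` above reads the "ranked-map" key while
// `set_ranked_map()` writes "ranked_map"; routing both through one shared
// constant keeps the getter and the setter from drifting apart. A sketch of
// that fix, reusing the setter body from above:
const RANKED_MAP_KEY: &str = "ranked-map";

pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> {
    let mut bytes = Vec::new();
    value.write_to_bin(&mut bytes)?;
    self.0.set(RANKED_MAP_KEY, bytes)?;
    Ok(())
}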
{ Some(bytes) => { let layout = LayoutVerified::new_slice(bytes.as_ref()).expect("invalid layout"); @@ -244,18 +262,33 @@ impl WordsIndex { None => Ok(None), } } + + pub fn set_doc_indexes(&self, word: &[u8], set: Option<&Set>) -> sled::Result<()> { + match set { + Some(set) => self.0.set(word, set.as_bytes())?, + None => self.0.del(word)?, + }; + + Ok(()) + } } #[derive(Clone)] -struct DocumentsIndex(Arc); +pub struct DocumentsIndex(Arc); impl DocumentsIndex { - fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> Result, Error> { + pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result> { let key = DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.get(key).map_err(Into::into) + self.0.get(key) } - fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { + pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec) -> sled::Result<()> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.set(key, value)?; + Ok(()) + } + + pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { let start = DocumentAttrKey::new(id, SchemaAttr::min()); let start = start.to_be_bytes(); @@ -269,7 +302,7 @@ impl DocumentsIndex { pub struct DocumentFieldsIter<'a>(sled::Iter<'a>); impl<'a> Iterator for DocumentFieldsIter<'a> { - type Item = Result<(SchemaAttr, IVec), Error>; + type Item = sled::Result<(SchemaAttr, IVec)>; fn next(&mut self) -> Option { match self.0.next() { @@ -279,51 +312,72 @@ impl<'a> Iterator for DocumentFieldsIter<'a> { let key = DocumentAttrKey::from_be_bytes(array); Some(Ok((key.attribute, value))) }, - Some(Err(e)) => Some(Err(Error::SledError(e))), + Some(Err(e)) => Some(Err(e)), None => None, } } } #[derive(Clone)] -pub struct Index(RawIndex); +pub struct Index(ArcSwap); + +pub struct InnerIndex { + pub words: fst::Set, + pub schema: Schema, + pub ranked_map: RankedMap, + pub raw: RawIndex, // TODO this will be a snapshot in the future +} impl Index { - pub fn query_builder(&self) -> QueryBuilder>> { - let word_index = self.word_index(); - QueryBuilder::new(word_index) + fn from_raw(raw: RawIndex) -> Result { + let words = match raw.main.words_set()? { + Some(words) => words, + None => fst::Set::default(), + }; + + let schema = match raw.main.schema()? { + Some(schema) => schema, + None => return Err(Error::SchemaMissing), + }; + + let ranked_map = match raw.main.ranked_map()? 
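// The zerocopy round-trip behind `doc_indexes`/`set_doc_indexes`, isolated:
// `AsBytes` exposes a `&[DocIndex]` as raw bytes for storage, and
// `LayoutVerified` re-checks length and alignment before viewing the stored
// bytes as `&[DocIndex]` again. Sketch against the zerocopy 0.2 API used here.
use zerocopy::{AsBytes, LayoutVerified};

fn doc_indexes_round_trip(indexes: &[DocIndex]) {
    let bytes: &[u8] = indexes.as_bytes();

    let layout: LayoutVerified<&[u8], [DocIndex]> =
        LayoutVerified::new_slice(bytes).expect("invalid length or alignment");

    assert_eq!(layout.into_slice(), indexes);
}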
{ + Some(map) => map, + None => RankedMap::default(), + }; + + let inner = InnerIndex { words, schema, ranked_map, raw }; + let index = Index(ArcSwap::new(Arc::new(inner))); + + Ok(index) + } + + pub fn query_builder(&self) -> QueryBuilder { + let lease = IndexLease(self.0.lease()); + QueryBuilder::new(lease) } pub fn query_builder_with_criteria<'c>( &self, criteria: Criteria<'c>, - ) -> QueryBuilder<'c, Lease>> + ) -> QueryBuilder<'c, IndexLease> { - let word_index = self.word_index(); - QueryBuilder::with_criteria(word_index, criteria) + let lease = IndexLease(self.0.lease()); + QueryBuilder::with_criteria(lease, criteria) } - pub fn schema(&self) -> &Schema { - self.0.schema() - } - - pub fn word_index(&self) -> Lease> { - self.0.word_index() - } - - pub fn ranked_map(&self) -> Lease> { - self.0.ranked_map() + pub fn lease_inner(&self) -> Lease> { + self.0.lease() } pub fn documents_addition(&self) -> DocumentsAddition { - let index = self.0.clone(); - let ranked_map = self.0.ranked_map().clone(); - DocumentsAddition::from_raw(index, ranked_map) + let ranked_map = self.0.lease().ranked_map.clone(); + DocumentsAddition::new(self, ranked_map) } pub fn documents_deletion(&self) -> DocumentsDeletion { - let index = self.0.clone(); - DocumentsDeletion::from_raw(index) + // let index = self.0.clone(); + // DocumentsDeletion::from_raw(index) + unimplemented!() } pub fn document( @@ -333,17 +387,18 @@ impl Index { ) -> Result, RmpError> where T: de::DeserializeOwned, { - let fields = match fields { - Some(fields) => { - let iter = fields.iter().filter_map(|n| self.0.schema().attribute(n)); - Some(HashSet::from_iter(iter)) - }, - None => None, - }; + let schema = &self.lease_inner().schema; + let fields = fields + .map(|fields| { + fields + .into_iter() + .filter_map(|name| schema.attribute(name)) + .collect() + }); let mut deserializer = Deserializer { document_id: id, - raw_index: &self.0, + index: &self, fields: fields.as_ref(), }; @@ -353,21 +408,35 @@ impl Index { } } -pub struct DocumentsAddition { - inner: RawIndex, +pub struct IndexLease(Lease>); + +impl Store for IndexLease { + type Error = Error; + + fn words(&self) -> Result<&fst::Set, Self::Error> { + Ok(&self.0.words) + } + + fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { + Ok(self.0.raw.words.doc_indexes(word)?) + } +} + +pub struct DocumentsAddition<'a> { + inner: &'a Index, indexer: Indexer, ranked_map: RankedMap, } -impl DocumentsAddition { - pub fn from_raw(inner: RawIndex, ranked_map: RankedMap) -> DocumentsAddition { +impl<'a> DocumentsAddition<'a> { + fn new(inner: &'a Index, ranked_map: RankedMap) -> DocumentsAddition<'a> { DocumentsAddition { inner, indexer: Indexer::new(), ranked_map } } pub fn update_document(&mut self, document: D) -> Result<(), Error> where D: serde::Serialize, { - let schema = self.inner.schema(); + let schema = &self.inner.lease_inner().schema; let identifier = schema.identifier_name(); let document_id = match extract_document_id(identifier, &document)? { @@ -375,6 +444,12 @@ impl DocumentsAddition { None => return Err(Error::MissingDocumentId), }; + // 1. remove the previous document match indexes + let mut documents_deletion = DocumentsDeletion::new(self.inner); + documents_deletion.delete_document(document_id); + documents_deletion.finalize()?; + + // 2. 
index the document fields let serializer = Serializer { schema, index: &self.inner, @@ -388,30 +463,70 @@ impl DocumentsAddition { Ok(()) } - pub fn finalize(self) -> sled::Result<()> { + pub fn finalize(self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let main = &lease_inner.raw.main; + let words = &lease_inner.raw.words; + let delta_index = self.indexer.build(); + let mut delta_words_builder = SetBuilder::memory(); - let index = self.inner.word_index(); - let new_index = index.insert_indexes(delta_index)?; + for (word, delta_set) in delta_index { + delta_words_builder.insert(&word).unwrap(); - let new_index = Arc::from(new_index); - self.inner.update_word_index(new_index); + let set = match words.doc_indexes(&word)? { + Some(set) => Union::new(&set, &delta_set).into_set_buf(), + None => delta_set, + }; + + words.set_doc_indexes(&word, Some(&set))?; + } + + let delta_words = delta_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let words = match main.words_set()? { + Some(words) => { + let op = OpBuilder::new() + .add(words.stream()) + .add(delta_words.stream()) + .r#union(); + + let mut words_builder = SetBuilder::memory(); + words_builder.extend_stream(op).unwrap(); + words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => delta_words, + }; + + main.set_words_set(&words)?; + main.set_ranked_map(&self.ranked_map)?; + + // update the "consistent" view of the Index + let ranked_map = self.ranked_map; + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + + let inner = InnerIndex { words, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); Ok(()) } } -pub struct DocumentsDeletion { - inner: RawIndex, +pub struct DocumentsDeletion<'a> { + inner: &'a Index, documents: Vec, } -impl DocumentsDeletion { - pub fn from_raw(inner: RawIndex) -> DocumentsDeletion { - DocumentsDeletion { - inner, - documents: Vec::new(), - } +impl<'a> DocumentsDeletion<'a> { + fn new(inner: &'a Index) -> DocumentsDeletion { + DocumentsDeletion { inner, documents: Vec::new() } } pub fn delete_document(&mut self, id: DocumentId) { @@ -423,13 +538,16 @@ impl DocumentsDeletion { self.documents.dedup(); let idset = SetBuf::new_unchecked(self.documents); - let index = self.inner.word_index(); - let new_index = index.remove_documents(&idset)?; - let new_index = Arc::from(new_index); + // let index = self.inner.word_index(); - self.inner.update_word_index(new_index); + // let new_index = index.remove_documents(&idset)?; + // let new_index = Arc::from(new_index); - Ok(()) + // self.inner.update_word_index(new_index); + + // Ok(()) + + unimplemented!("documents deletion finalize") } } diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index 3405a79de..a578f52c7 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -1,78 +1,13 @@ use std::collections::BTreeMap; use std::convert::TryFrom; -use std::sync::Arc; use deunicode::deunicode_with_tofu; use meilidb_core::{DocumentId, DocIndex, Store}; use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; -use sdset::{Set, SetBuf}; -use sled::Tree; -use zerocopy::{AsBytes, LayoutVerified}; +use sdset::SetBuf; use crate::SchemaAttr; -#[derive(Clone)] -pub struct WordIndexTree(pub Arc); - -impl Store for WordIndexTree { - type Error = sled::Error; - - fn get_fst(&self) -> Result { - match self.0.get("fst")? 
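// The word-set merge step from `finalize` above, in isolation: stream the
// union of the current words and the delta words into a fresh builder, then
// rebuild a `Set` from the produced bytes (same fst fork API as this code).
use fst::{Set, SetBuilder, set::OpBuilder};

fn union_word_sets(current: &Set, delta: &Set) -> Set {
    let op = OpBuilder::new()
        .add(current.stream())
        .add(delta.stream())
        .r#union();

    let mut builder = SetBuilder::memory();
    builder.extend_stream(op).unwrap();
    builder
        .into_inner()
        .and_then(Set::from_bytes)
        .unwrap()
}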
{ - Some(bytes) => { - let bytes: Arc<[u8]> = bytes.into(); - let len = bytes.len(); - let raw = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); - Ok(fst::Set::from(raw)) - }, - None => Ok(fst::Set::default()), - } - } - - fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error> { - let bytes = set.as_fst().to_vec(); - self.0.set("fst", bytes)?; - Ok(()) - } - - fn get_indexes(&self, word: &[u8]) -> Result>, Self::Error> { - let mut word_bytes = Vec::from("word-"); - word_bytes.extend_from_slice(word); - - match self.0.get(word_bytes)? { - Some(bytes) => { - let layout = LayoutVerified::new_slice(bytes.as_ref()).unwrap(); - let slice = layout.into_slice(); - let setbuf = SetBuf::new_unchecked(slice.to_vec()); - Ok(Some(setbuf)) - }, - None => Ok(None), - } - } - - fn set_indexes(&self, word: &[u8], indexes: &Set) -> Result<(), Self::Error> { - let mut word_bytes = Vec::from("word-"); - word_bytes.extend_from_slice(word); - - let slice = indexes.as_slice(); - let bytes = slice.as_bytes(); - - self.0.set(word_bytes, bytes)?; - - Ok(()) - } - - fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error> { - let mut word_bytes = Vec::from("word-"); - word_bytes.extend_from_slice(word); - - self.0.del(word_bytes)?; - - Ok(()) - } - -} - type Word = Vec; // TODO make it be a SmallVec pub struct Indexer { @@ -115,6 +50,7 @@ impl Indexer { pub fn build(self) -> BTreeMap> { self.indexed.into_iter().map(|(word, mut indexes)| { indexes.sort_unstable(); + indexes.dedup(); (word, SetBuf::new_unchecked(indexes)) }).collect() } diff --git a/meilidb-data/src/serde/deserializer.rs b/meilidb-data/src/serde/deserializer.rs index dbebd8abf..5c7baec38 100644 --- a/meilidb-data/src/serde/deserializer.rs +++ b/meilidb-data/src/serde/deserializer.rs @@ -6,12 +6,12 @@ use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; use rmp_serde::decode::{Error as RmpError}; use serde::{de, forward_to_deserialize_any}; -use crate::database::RawIndex; +use crate::database::Index; use crate::SchemaAttr; pub struct Deserializer<'a> { pub document_id: DocumentId, - pub raw_index: &'a RawIndex, + pub index: &'a Index, pub fields: Option<&'a HashSet>, } @@ -26,15 +26,18 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> } forward_to_deserialize_any! 
{ - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq - bytes byte_buf unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct struct enum identifier ignored_any } fn deserialize_map(self, visitor: V) -> Result where V: de::Visitor<'de> { - let document_attributes = self.raw_index.get_document_fields(self.document_id); + let schema = &self.index.lease_inner().schema; + let documents = &self.index.lease_inner().raw.documents; + + let document_attributes = documents.document_fields(self.document_id); let document_attributes = document_attributes.filter_map(|result| { match result { Ok(value) => Some(value), @@ -45,9 +48,10 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> }, } }); + let iter = document_attributes.filter_map(|(attr, value)| { if self.fields.map_or(true, |f| f.contains(&attr)) { - let attribute_name = self.raw_index.schema().attribute_name(attr); + let attribute_name = schema.attribute_name(attr); Some((attribute_name, Value::new(value))) } else { None diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs index 6f4ad3608..d7a7b7d85 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-data/src/serde/serializer.rs @@ -1,7 +1,7 @@ use meilidb_core::DocumentId; use serde::ser; -use crate::database::RawIndex; +use crate::database::Index; use crate::ranked_map::RankedMap; use crate::indexer::Indexer as RawIndexer; use crate::schema::Schema; @@ -9,7 +9,7 @@ use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, - pub index: &'a RawIndex, + pub index: &'a Index, pub indexer: &'a mut RawIndexer, pub ranked_map: &'a mut RankedMap, pub document_id: DocumentId, @@ -171,7 +171,7 @@ impl<'a> ser::Serializer for Serializer<'a> { pub struct MapSerializer<'a> { schema: &'a Schema, document_id: DocumentId, - index: &'a RawIndex, + index: &'a Index, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, current_key_name: Option, @@ -224,7 +224,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { pub struct StructSerializer<'a> { schema: &'a Schema, document_id: DocumentId, - index: &'a RawIndex, + index: &'a Index, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, } @@ -259,7 +259,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { fn serialize_value( schema: &Schema, document_id: DocumentId, - index: &RawIndex, + index: &Index, indexer: &mut RawIndexer, ranked_map: &mut RankedMap, key: &str, @@ -272,7 +272,7 @@ where T: ser::Serialize, if props.is_stored() { let value = rmp_serde::to_vec_named(value)?; - index.set_document_attribute(document_id, attr, value)?; + index.lease_inner().raw.documents.set_document_field(document_id, attr, value)?; } if props.is_indexed() { From 1f2abce7c398041c12302ac3c9e2108432a40d29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 May 2019 16:16:07 +0200 Subject: [PATCH 14/32] feat: Introduce the DocumentsDeletion type --- meilidb-core/src/query_builder.rs | 5 +- meilidb-data/Cargo.toml | 1 - meilidb-data/src/database.rs | 258 ++++++++++++++++++++++++------ meilidb-data/src/indexer.rs | 74 ++++++--- 4 files changed, 271 insertions(+), 67 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 73d0601a9..058c61ecc 100644 --- 
a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -107,7 +107,10 @@ where S: Store, let is_exact = distance == 0 && input.len() == automaton.query_len(); let doc_indexes = self.store.word_indexes(input)?; - let doc_indexes = doc_indexes.expect("word doc-indexes not found"); + let doc_indexes = match doc_indexes { + Some(doc_indexes) => doc_indexes, + None => continue, + }; for di in doc_indexes.as_slice() { if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) { diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index c883196d4..eb0d92f55 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [dependencies] arc-swap = "0.3.11" bincode = "1.1.2" -byteorder = "1.3.1" deunicode = "1.0.0" hashbrown = { version = "0.2.2", features = ["serde"] } linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 20773e00d..3336738c0 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,25 +1,22 @@ -use std::collections::{HashSet, HashMap}; +use std::collections::{BTreeSet, HashSet, HashMap}; use std::collections::hash_map::Entry; use std::convert::TryInto; -use std::io::{self, Cursor, BufRead}; -use std::iter::FromIterator; use std::path::Path; use std::sync::{Arc, RwLock}; use std::{error, fmt}; use arc_swap::{ArcSwap, Lease}; -use byteorder::{ReadBytesExt, BigEndian}; use meilidb_core::{criterion::Criteria, QueryBuilder, Store, DocumentId, DocIndex}; use rmp_serde::decode::{Error as RmpError}; -use sdset::{Set, SetBuf, SetOperation, duo::Union}; +use sdset::{Set, SetBuf, SetOperation, duo::{Union, DifferenceByKey}}; use serde::de; use sled::IVec; use zerocopy::{AsBytes, LayoutVerified}; -use fst::{SetBuilder, set::OpBuilder}; +use fst::{SetBuilder, set::OpBuilder, Streamer}; use crate::{Schema, SchemaAttr, RankedMap}; use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError}; -use crate::indexer::Indexer; +use crate::indexer::{Indexer, Indexed}; use crate::document_attr_key::DocumentAttrKey; #[derive(Debug)] @@ -88,6 +85,22 @@ impl Database { Ok(Database { cache, inner }) } + pub fn indexes(&self) -> Result>, Error> { + let bytes = match self.inner.get("indexes")? { + Some(bytes) => bytes, + None => return Ok(None), + }; + + let indexes = bincode::deserialize(&bytes)?; + Ok(Some(indexes)) + } + + pub fn set_indexes(&self, value: &HashSet) -> Result<(), Error> { + let bytes = bincode::serialize(value)?; + self.inner.set("indexes", bytes)?; + Ok(()) + } + pub fn open_index(&self, name: &str) -> Result>, Error> { { let cache = self.cache.read().unwrap(); @@ -102,14 +115,8 @@ impl Database { occupied.get().clone() }, Entry::Vacant(vacant) => { - let bytes = match self.inner.get("indexes")? 
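// The index registry manipulated by the `indexes`/`set_indexes` helpers above
// is just a bincode-encoded `HashSet<String>` stored under the "indexes" key;
// its round-trip looks like this ("movies" is an illustrative name):
use std::collections::HashSet;

let mut indexes = HashSet::new();
indexes.insert("movies".to_string());

let bytes = bincode::serialize(&indexes).unwrap();
let decoded: HashSet<String> = bincode::deserialize(&bytes).unwrap();
assert_eq!(indexes, decoded);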
{ - Some(bytes) => bytes, - None => return Ok(None), - }; - - let indexes: HashSet<&str> = bincode::deserialize(&bytes)?; - if indexes.get(name).is_none() { - return Ok(None); + if !self.indexes()?.map_or(false, |x| !x.contains(name)) { + return Ok(None) } let main = { @@ -123,13 +130,19 @@ impl Database { WordsIndex(tree) }; + let attrs_words = { + let tree_name = format!("{}-attrs-words", name); + let tree = self.inner.open_tree(tree_name)?; + AttrsWords(tree) + }; + let documents = { let tree_name = format!("{}-documents", name); let tree = self.inner.open_tree(tree_name)?; DocumentsIndex(tree) }; - let raw_index = RawIndex { main, words, documents }; + let raw_index = RawIndex { main, words, attrs_words, documents }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -147,16 +160,6 @@ impl Database { occupied.get().clone() }, Entry::Vacant(vacant) => { - let bytes = self.inner.get("indexes")?; - let bytes = bytes.as_ref(); - - let mut indexes: HashSet<&str> = match bytes { - Some(bytes) => bincode::deserialize(bytes)?, - None => HashSet::new(), - }; - - let new_insertion = indexes.insert(name); - let main = { let tree = self.inner.open_tree(name)?; MainIndex(tree) @@ -168,19 +171,31 @@ impl Database { } } + main.set_schema(&schema)?; + let words = { let tree_name = format!("{}-words", name); let tree = self.inner.open_tree(tree_name)?; WordsIndex(tree) }; + let attrs_words = { + let tree_name = format!("{}-attrs-words", name); + let tree = self.inner.open_tree(tree_name)?; + AttrsWords(tree) + }; + let documents = { let tree_name = format!("{}-documents", name); let tree = self.inner.open_tree(tree_name)?; DocumentsIndex(tree) }; - let raw_index = RawIndex { main, words, documents }; + let mut indexes = self.indexes()?.unwrap_or_else(HashSet::new); + indexes.insert(name.to_string()); + self.set_indexes(&indexes)?; + + let raw_index = RawIndex { main, words, attrs_words, documents }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -195,6 +210,7 @@ impl Database { pub struct RawIndex { pub main: MainIndex, pub words: WordsIndex, + pub attrs_words: AttrsWords, pub documents: DocumentsIndex, } @@ -212,6 +228,13 @@ impl MainIndex { } } + pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> { + let mut bytes = Vec::new(); + schema.write_to_bin(&mut bytes)?; + self.0.set("schema", bytes)?; + Ok(()) + } + pub fn words_set(&self) -> Result, Error> { match self.0.get("words")? { Some(bytes) => { @@ -263,16 +286,87 @@ impl WordsIndex { } } - pub fn set_doc_indexes(&self, word: &[u8], set: Option<&Set>) -> sled::Result<()> { - match set { - Some(set) => self.0.set(word, set.as_bytes())?, - None => self.0.del(word)?, + pub fn set_doc_indexes(&self, word: &[u8], set: &Set) -> sled::Result<()> { + self.0.set(word, set.as_bytes())?; + Ok(()) + } + + pub fn del_doc_indexes(&self, word: &[u8]) -> sled::Result<()> { + self.0.del(word)?; + Ok(()) + } +} + +#[derive(Clone)] +pub struct AttrsWords(Arc); + +impl AttrsWords { + pub fn attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result, Error> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + match self.0.get(key)? 
{ + Some(bytes) => { + let len = bytes.len(); + let value = bytes.into(); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + None => Ok(None) + } + } + + pub fn attrs_words(&self, id: DocumentId) -> DocumentAttrsWordsIter { + let start = DocumentAttrKey::new(id, SchemaAttr::min()); + let start = start.to_be_bytes(); + + let end = DocumentAttrKey::new(id, SchemaAttr::max()); + let end = end.to_be_bytes(); + + DocumentAttrsWordsIter(self.0.range(start..=end)) + } + + pub fn set_attr_words( + &self, + id: DocumentId, + attr: SchemaAttr, + words: Option<&fst::Set>, + ) -> Result<(), Error> + { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + + match words { + Some(words) => self.0.set(key, words.as_fst().as_bytes())?, + None => self.0.del(key)?, }; Ok(()) } } +pub struct DocumentAttrsWordsIter<'a>(sled::Iter<'a>); + +impl<'a> Iterator for DocumentAttrsWordsIter<'a> { + type Item = sled::Result<(SchemaAttr, fst::Set)>; + + fn next(&mut self) -> Option { + match self.0.next() { + Some(Ok((key, bytes))) => { + let slice: &[u8] = key.as_ref(); + let array = slice.try_into().unwrap(); + let key = DocumentAttrKey::from_be_bytes(array); + + let len = bytes.len(); + let value = bytes.into(); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len).unwrap(); + let set = fst::Set::from(fst); + + Some(Ok((key.attribute, set))) + }, + Some(Err(e)) => Some(Err(e.into())), + None => None, + } + } +} + #[derive(Clone)] pub struct DocumentsIndex(Arc); @@ -288,6 +382,12 @@ impl DocumentsIndex { Ok(()) } + pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result<()> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.del(key)?; + Ok(()) + } + pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { let start = DocumentAttrKey::new(id, SchemaAttr::min()); let start = start.to_be_bytes(); @@ -375,9 +475,7 @@ impl Index { } pub fn documents_deletion(&self) -> DocumentsDeletion { - // let index = self.0.clone(); - // DocumentsDeletion::from_raw(index) - unimplemented!() + DocumentsDeletion::new(self) } pub fn document( @@ -467,11 +565,12 @@ impl<'a> DocumentsAddition<'a> { let lease_inner = self.inner.lease_inner(); let main = &lease_inner.raw.main; let words = &lease_inner.raw.words; + let attrs_words = &lease_inner.raw.attrs_words; - let delta_index = self.indexer.build(); + let Indexed { words_doc_indexes, docs_attrs_words } = self.indexer.build(); let mut delta_words_builder = SetBuilder::memory(); - for (word, delta_set) in delta_index { + for (word, delta_set) in words_doc_indexes { delta_words_builder.insert(&word).unwrap(); let set = match words.doc_indexes(&word)? 
{ @@ -479,7 +578,11 @@ impl<'a> DocumentsAddition<'a> { None => delta_set, }; - words.set_doc_indexes(&word, Some(&set))?; + words.set_doc_indexes(&word, &set)?; + } + + for ((id, attr), words) in docs_attrs_words { + attrs_words.set_attr_words(id, attr, Some(&words))?; } let delta_words = delta_words_builder @@ -534,20 +637,83 @@ impl<'a> DocumentsDeletion<'a> { } pub fn finalize(mut self) -> Result<(), Error> { - self.documents.sort_unstable(); - self.documents.dedup(); + let lease_inner = self.inner.lease_inner(); + let main = &lease_inner.raw.main; + let attrs_words = &lease_inner.raw.attrs_words; + let words = &lease_inner.raw.words; + let documents = &lease_inner.raw.documents; - let idset = SetBuf::new_unchecked(self.documents); + let idset = { + self.documents.sort_unstable(); + self.documents.dedup(); + SetBuf::new_unchecked(self.documents) + }; - // let index = self.inner.word_index(); + let mut words_attrs = HashMap::new(); + for id in idset.into_vec() { + for result in attrs_words.attrs_words(id) { + let (attr, words) = result?; + let mut stream = words.stream(); + while let Some(word) = stream.next() { + let word = word.to_vec(); + words_attrs.entry(word).or_insert_with(Vec::new).push((id, attr)); + } + } + } - // let new_index = index.remove_documents(&idset)?; - // let new_index = Arc::from(new_index); + let mut removed_words = BTreeSet::new(); + for (word, mut attrs) in words_attrs { + attrs.sort_unstable(); + attrs.dedup(); + let attrs = SetBuf::new_unchecked(attrs); - // self.inner.update_word_index(new_index); + if let Some(doc_indexes) = words.doc_indexes(&word)? { + let op = DifferenceByKey::new(&doc_indexes, &attrs, |d| d.document_id, |(id, _)| *id); + let doc_indexes = op.into_set_buf(); - // Ok(()) + if !doc_indexes.is_empty() { + words.set_doc_indexes(&word, &doc_indexes)?; + } else { + words.del_doc_indexes(&word)?; + removed_words.insert(word); + } + } - unimplemented!("documents deletion finalize") + for (id, attr) in attrs.into_vec() { + documents.del_document_field(id, attr)?; + } + } + + let removed_words = fst::Set::from_iter(removed_words).unwrap(); + let words = match main.words_set()? { + Some(words_set) => { + let op = fst::set::OpBuilder::new() + .add(words_set.stream()) + .add(removed_words.stream()) + .difference(); + + let mut words_builder = SetBuilder::memory(); + words_builder.extend_stream(op).unwrap(); + words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => fst::Set::default(), + }; + + main.set_words_set(&words)?; + + // TODO must update the ranked_map too! 
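// The posting-list subtraction used just above, in isolation: drop from one
// word's `DocIndex` set every entry whose document id appears in the sorted
// deletion set, matching the two sides by key with sdset.
use sdset::{Set, SetBuf, SetOperation, duo::DifferenceByKey};

fn remove_deleted(doc_indexes: &Set<DocIndex>, deleted: &Set<DocumentId>) -> SetBuf<DocIndex> {
    DifferenceByKey::new(doc_indexes, deleted, |d| d.document_id, |id| *id)
        .into_set_buf()
}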
+ + // update the "consistent" view of the Index + let ranked_map = lease_inner.ranked_map.clone(); + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + + let inner = InnerIndex { words, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) } } diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index a578f52c7..1aa3095c8 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -1,8 +1,8 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; use deunicode::deunicode_with_tofu; -use meilidb_core::{DocumentId, DocIndex, Store}; +use meilidb_core::{DocumentId, DocIndex}; use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; use sdset::SetBuf; @@ -12,27 +12,39 @@ type Word = Vec; // TODO make it be a SmallVec pub struct Indexer { word_limit: usize, // the maximum number of indexed words - indexed: BTreeMap>, + words_doc_indexes: BTreeMap>, + docs_attrs_words: HashMap<(DocumentId, SchemaAttr), Vec>, +} + +pub struct Indexed { + pub words_doc_indexes: BTreeMap>, + pub docs_attrs_words: HashMap<(DocumentId, SchemaAttr), fst::Set>, } impl Indexer { pub fn new() -> Indexer { - Indexer { - word_limit: 1000, - indexed: BTreeMap::new(), - } + Indexer::with_word_limit(1000) } pub fn with_word_limit(limit: usize) -> Indexer { Indexer { word_limit: limit, - indexed: BTreeMap::new(), + words_doc_indexes: BTreeMap::new(), + docs_attrs_words: HashMap::new(), } } pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { for token in Tokenizer::new(text) { - let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed); + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_attrs_words, + ); + if !must_continue { break } } } @@ -42,17 +54,38 @@ impl Indexer { { let iter = iter.into_iter(); for token in SeqTokenizer::new(iter) { - let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed); + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_attrs_words, + ); + if !must_continue { break } } } - pub fn build(self) -> BTreeMap> { - self.indexed.into_iter().map(|(word, mut indexes)| { - indexes.sort_unstable(); - indexes.dedup(); - (word, SetBuf::new_unchecked(indexes)) - }).collect() + pub fn build(self) -> Indexed { + let words_doc_indexes = self.words_doc_indexes + .into_iter() + .map(|(word, mut indexes)| { + indexes.sort_unstable(); + indexes.dedup(); + (word, SetBuf::new_unchecked(indexes)) + }).collect(); + + let docs_attrs_words = self.docs_attrs_words + .into_iter() + .map(|((id, attr), mut words)| { + words.sort_unstable(); + words.dedup(); + ((id, attr), fst::Set::from_iter(words).unwrap()) + }) + .collect(); + + Indexed { words_doc_indexes, docs_attrs_words } } } @@ -61,7 +94,8 @@ fn index_token( id: DocumentId, attr: SchemaAttr, word_limit: usize, - indexed: &mut BTreeMap>, + words_doc_indexes: &mut BTreeMap>, + docs_attrs_words: &mut HashMap<(DocumentId, SchemaAttr), Vec>, ) -> bool { if token.word_index >= word_limit { return false } @@ -71,7 +105,8 @@ fn index_token( match token_to_docindex(id, attr, token) { Some(docindex) => { let word = Vec::from(token.word); - indexed.entry(word).or_insert_with(Vec::new).push(docindex); + words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); + docs_attrs_words.entry((id, 
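// Hedged usage sketch of the reworked `Indexer` above: index some attribute
// text, then `build()` yields both the sorted, deduplicated postings per word
// and the per-(document, attribute) word sets that deletion will need later.
let mut indexer = Indexer::new();
indexer.index_text(DocumentId(0), SchemaAttr::new(0), "the quick brown fox");

let Indexed { words_doc_indexes, docs_attrs_words } = indexer.build();
assert!(words_doc_indexes.contains_key("quick".as_bytes()));
assert!(docs_attrs_words.contains_key(&(DocumentId(0), SchemaAttr::new(0))));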
attr)).or_insert_with(Vec::new).push(word); }, None => return false, } @@ -83,7 +118,8 @@ fn index_token( match token_to_docindex(id, attr, token) { Some(docindex) => { let word = Vec::from(token.word); - indexed.entry(word).or_insert_with(Vec::new).push(docindex); + words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); + docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word); }, None => return false, } From dff81bb16109851c1b52fd199168ee09c71e530b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 10 May 2019 11:56:17 +0200 Subject: [PATCH 15/32] feat: Prefer set/del methods instead of set with an Option type --- meilidb-data/src/database.rs | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 3336738c0..1a5a94048 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -324,20 +324,15 @@ impl AttrsWords { DocumentAttrsWordsIter(self.0.range(start..=end)) } - pub fn set_attr_words( - &self, - id: DocumentId, - attr: SchemaAttr, - words: Option<&fst::Set>, - ) -> Result<(), Error> - { + pub fn set_attr_words(&self, id: DocumentId, attr: SchemaAttr, words: &fst::Set) -> Result<(), Error> { let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.set(key, words.as_fst().as_bytes())?; + Ok(()) + } - match words { - Some(words) => self.0.set(key, words.as_fst().as_bytes())?, - None => self.0.del(key)?, - }; - + pub fn del_attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result<(), Error> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.del(key)?; Ok(()) } } @@ -582,7 +577,7 @@ impl<'a> DocumentsAddition<'a> { } for ((id, attr), words) in docs_attrs_words { - attrs_words.set_attr_words(id, attr, Some(&words))?; + attrs_words.set_attr_words(id, attr, &words)?; } let delta_words = delta_words_builder @@ -681,6 +676,7 @@ impl<'a> DocumentsDeletion<'a> { for (id, attr) in attrs.into_vec() { documents.del_document_field(id, attr)?; + attrs_words.del_attr_words(id, attr)?; } } From 2844cb5bca66df410cd2ee9b1603e0477c74db02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 10 May 2019 12:05:19 +0200 Subject: [PATCH 16/32] fix: Make the examples compile --- meilidb-data/src/database.rs | 4 ++++ meilidb/examples/create-database.rs | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 1a5a94048..de48961f3 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -464,6 +464,10 @@ impl Index { self.0.lease() } + pub fn schema(&self) -> Schema { + self.0.lease().schema.clone() + } + pub fn documents_addition(&self) -> DocumentsAddition { let ranked_map = self.0.lease().ranked_map.clone(); DocumentsAddition::new(self, ranked_map) diff --git a/meilidb/examples/create-database.rs b/meilidb/examples/create-database.rs index b0bfa1127..8fbbe368a 100644 --- a/meilidb/examples/create-database.rs +++ b/meilidb/examples/create-database.rs @@ -52,7 +52,7 @@ fn index( { let database = Database::start_default(database_path)?; - let index = database.create_index("default".to_string(), schema.clone())?; + let index = database.create_index("default", schema.clone())?; let mut rdr = csv::Reader::from_path(csv_data_path)?; let mut raw_record = csv::StringRecord::new(); From 9bba90c47edf43860a5dac397ec0465c208b2438 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 10 May 2019 14:44:23 +0200 Subject: [PATCH 17/32] fix: Fix a bug in the Database open-index method --- meilidb-data/src/database.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index de48961f3..745c5a8b5 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -115,7 +115,7 @@ impl Database { occupied.get().clone() }, Entry::Vacant(vacant) => { - if !self.indexes()?.map_or(false, |x| !x.contains(name)) { + if !self.indexes()?.map_or(false, |x| x.contains(name)) { return Ok(None) } From aa90f22865603224616823c752fbd6ab6cab6f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 13 May 2019 15:29:04 +0200 Subject: [PATCH 18/32] feat: Remove the Index dependency of the Serializer --- meilidb-data/src/database.rs | 46 +++++++++++++++++++++------- meilidb-data/src/serde/mod.rs | 21 +++++++++++++ meilidb-data/src/serde/serializer.rs | 23 +++++++------- 3 files changed, 67 insertions(+), 23 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 745c5a8b5..01273e186 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,5 +1,5 @@ -use std::collections::{BTreeSet, HashSet, HashMap}; use std::collections::hash_map::Entry; +use std::collections::{BTreeSet, HashSet, HashMap}; use std::convert::TryInto; use std::path::Path; use std::sync::{Arc, RwLock}; @@ -14,10 +14,11 @@ use sled::IVec; use zerocopy::{AsBytes, LayoutVerified}; use fst::{SetBuilder, set::OpBuilder, Streamer}; -use crate::{Schema, SchemaAttr, RankedMap}; -use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError}; -use crate::indexer::{Indexer, Indexed}; use crate::document_attr_key::DocumentAttrKey; +use crate::indexer::{Indexer, Indexed}; +use crate::serde::extract_document_id; +use crate::serde::{Serializer, RamDocumentStore, Deserializer, SerializerError}; +use crate::{Schema, SchemaAttr, RankedMap}; #[derive(Debug)] pub enum Error { @@ -521,13 +522,21 @@ impl Store for IndexLease { pub struct DocumentsAddition<'a> { inner: &'a Index, + document_ids: HashSet, + document_store: RamDocumentStore, indexer: Indexer, ranked_map: RankedMap, } impl<'a> DocumentsAddition<'a> { fn new(inner: &'a Index, ranked_map: RankedMap) -> DocumentsAddition<'a> { - DocumentsAddition { inner, indexer: Indexer::new(), ranked_map } + DocumentsAddition { + inner, + document_ids: HashSet::new(), + document_store: RamDocumentStore::new(), + indexer: Indexer::new(), + ranked_map, + } } pub fn update_document(&mut self, document: D) -> Result<(), Error> @@ -541,15 +550,13 @@ impl<'a> DocumentsAddition<'a> { None => return Err(Error::MissingDocumentId), }; - // 1. remove the previous document match indexes - let mut documents_deletion = DocumentsDeletion::new(self.inner); - documents_deletion.delete_document(document_id); - documents_deletion.finalize()?; + // 1. store the document id for future deletion + self.document_ids.insert(document_id); - // 2. index the document fields + // 2. 
index the document fields in ram stores let serializer = Serializer { schema, - index: &self.inner, + document_store: &mut self.document_store, indexer: &mut self.indexer, ranked_map: &mut self.ranked_map, document_id, @@ -565,6 +572,17 @@ impl<'a> DocumentsAddition<'a> { let main = &lease_inner.raw.main; let words = &lease_inner.raw.words; let attrs_words = &lease_inner.raw.attrs_words; + let documents = &lease_inner.raw.documents; + + // 1. remove the previous documents match indexes + let mut documents_deletion = DocumentsDeletion::new(self.inner); + documents_deletion.extend(self.document_ids); + documents_deletion.finalize()?; + + // 2. insert new document attributes in the database + for ((id, attr), value) in self.document_store.into_inner() { + documents.set_document_field(id, attr, value)?; + } let Indexed { words_doc_indexes, docs_attrs_words } = self.indexer.build(); let mut delta_words_builder = SetBuilder::memory(); @@ -717,3 +735,9 @@ impl<'a> DocumentsDeletion<'a> { Ok(()) } } + +impl<'a> Extend for DocumentsDeletion<'a> { + fn extend>(&mut self, iter: T) { + self.documents.extend(iter) + } +} diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs index cf222c1bd..1e2854c36 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-data/src/serde/mod.rs @@ -22,10 +22,15 @@ pub use self::convert_to_number::ConvertToNumber; pub use self::indexer::Indexer; pub use self::serializer::Serializer; +use std::collections::BTreeMap; use std::{fmt, error::Error}; + +use meilidb_core::DocumentId; use rmp_serde::encode::Error as RmpError; use serde::ser; + use crate::number::ParseNumberError; +use crate::schema::SchemaAttr; #[derive(Debug)] pub enum SerializerError { @@ -95,3 +100,19 @@ impl From for SerializerError { SerializerError::ParseNumberError(error) } } + +pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec>); + +impl RamDocumentStore { + pub fn new() -> RamDocumentStore { + RamDocumentStore(BTreeMap::new()) + } + + pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec) { + self.0.insert((id, attr), value); + } + + pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec> { + self.0 + } +} diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs index d7a7b7d85..0636d86d6 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-data/src/serde/serializer.rs @@ -1,15 +1,14 @@ use meilidb_core::DocumentId; use serde::ser; -use crate::database::Index; -use crate::ranked_map::RankedMap; use crate::indexer::Indexer as RawIndexer; +use crate::ranked_map::RankedMap; use crate::schema::Schema; -use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; +use super::{RamDocumentStore, SerializerError, ConvertToString, ConvertToNumber, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, - pub index: &'a Index, + pub document_store: &'a mut RamDocumentStore, pub indexer: &'a mut RawIndexer, pub ranked_map: &'a mut RankedMap, pub document_id: DocumentId, @@ -134,7 +133,7 @@ impl<'a> ser::Serializer for Serializer<'a> { Ok(MapSerializer { schema: self.schema, document_id: self.document_id, - index: self.index, + document_store: self.document_store, indexer: self.indexer, ranked_map: self.ranked_map, current_key_name: None, @@ -150,7 +149,7 @@ impl<'a> ser::Serializer for Serializer<'a> { Ok(StructSerializer { schema: self.schema, document_id: self.document_id, - index: self.index, + document_store: self.document_store, indexer: self.indexer, ranked_map: 
self.ranked_map, }) @@ -171,7 +170,7 @@ impl<'a> ser::Serializer for Serializer<'a> { pub struct MapSerializer<'a> { schema: &'a Schema, document_id: DocumentId, - index: &'a Index, + document_store: &'a mut RamDocumentStore, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, current_key_name: Option, @@ -208,7 +207,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { serialize_value( self.schema, self.document_id, - self.index, + self.document_store, self.indexer, self.ranked_map, &key, @@ -224,7 +223,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { pub struct StructSerializer<'a> { schema: &'a Schema, document_id: DocumentId, - index: &'a Index, + document_store: &'a mut RamDocumentStore, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, } @@ -243,7 +242,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { serialize_value( self.schema, self.document_id, - self.index, + self.document_store, self.indexer, self.ranked_map, key, @@ -259,7 +258,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { fn serialize_value( schema: &Schema, document_id: DocumentId, - index: &Index, + document_store: &mut RamDocumentStore, indexer: &mut RawIndexer, ranked_map: &mut RankedMap, key: &str, @@ -272,7 +271,7 @@ where T: ser::Serialize, if props.is_stored() { let value = rmp_serde::to_vec_named(value)?; - index.lease_inner().raw.documents.set_document_field(document_id, attr, value)?; + document_store.set_document_field(document_id, attr, value); } if props.is_indexed() { From 169bd4cb399703508c562881396b6800eb3c45e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 13 May 2019 16:22:36 +0200 Subject: [PATCH 19/32] feat: Store all documents words by document rather than by attribute --- meilidb-data/src/database.rs | 118 ++++++++++++++--------------------- meilidb-data/src/indexer.rs | 24 +++---- 2 files changed, 59 insertions(+), 83 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 01273e186..badbc2bf8 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -15,7 +15,7 @@ use zerocopy::{AsBytes, LayoutVerified}; use fst::{SetBuilder, set::OpBuilder, Streamer}; use crate::document_attr_key::DocumentAttrKey; -use crate::indexer::{Indexer, Indexed}; +use crate::indexer::Indexer; use crate::serde::extract_document_id; use crate::serde::{Serializer, RamDocumentStore, Deserializer, SerializerError}; use crate::{Schema, SchemaAttr, RankedMap}; @@ -131,10 +131,10 @@ impl Database { WordsIndex(tree) }; - let attrs_words = { - let tree_name = format!("{}-attrs-words", name); + let docs_words = { + let tree_name = format!("{}-docs-words", name); let tree = self.inner.open_tree(tree_name)?; - AttrsWords(tree) + DocsWords(tree) }; let documents = { @@ -143,7 +143,7 @@ impl Database { DocumentsIndex(tree) }; - let raw_index = RawIndex { main, words, attrs_words, documents }; + let raw_index = RawIndex { main, words, docs_words, documents }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -180,10 +180,10 @@ impl Database { WordsIndex(tree) }; - let attrs_words = { - let tree_name = format!("{}-attrs-words", name); + let docs_words = { + let tree_name = format!("{}-docs-words", name); let tree = self.inner.open_tree(tree_name)?; - AttrsWords(tree) + DocsWords(tree) }; let documents = { @@ -196,7 +196,7 @@ impl Database { indexes.insert(name.to_string()); self.set_indexes(&indexes)?; - let raw_index = RawIndex { main, words, attrs_words, documents }; 
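
For context on the storage layout here: after this patch each index is backed by four sled
trees, all derived from the index name. A minimal sketch of that naming convention follows;
the helper function is illustrative only, the tree names themselves are the ones opened in
open_index/create_index above:

    // Illustrative helper only; mirrors the tree names used by open_index/create_index.
    fn index_tree_names(name: &str) -> [String; 4] {
        [
            name.to_string(),               // main tree: schema, words fst, ranked map
            format!("{}-words", name),      // word -> sorted set of DocIndex
            format!("{}-docs-words", name), // document id -> fst::Set of its words
            format!("{}-documents", name),  // (document id, attr) -> serialized field
        ]
    }
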
+ let raw_index = RawIndex { main, words, docs_words, documents }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -211,7 +211,7 @@ impl Database { pub struct RawIndex { pub main: MainIndex, pub words: WordsIndex, - pub attrs_words: AttrsWords, + pub docs_words: DocsWords, pub documents: DocumentsIndex, } @@ -299,11 +299,11 @@ impl WordsIndex { } #[derive(Clone)] -pub struct AttrsWords(Arc); +pub struct DocsWords(Arc); -impl AttrsWords { - pub fn attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result, Error> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); +impl DocsWords { + pub fn doc_words(&self, id: DocumentId) -> Result, Error> { + let key = id.0.to_be_bytes(); match self.0.get(key)? { Some(bytes) => { let len = bytes.len(); @@ -315,54 +315,19 @@ impl AttrsWords { } } - pub fn attrs_words(&self, id: DocumentId) -> DocumentAttrsWordsIter { - let start = DocumentAttrKey::new(id, SchemaAttr::min()); - let start = start.to_be_bytes(); - - let end = DocumentAttrKey::new(id, SchemaAttr::max()); - let end = end.to_be_bytes(); - - DocumentAttrsWordsIter(self.0.range(start..=end)) - } - - pub fn set_attr_words(&self, id: DocumentId, attr: SchemaAttr, words: &fst::Set) -> Result<(), Error> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> { + let key = id.0.to_be_bytes(); self.0.set(key, words.as_fst().as_bytes())?; Ok(()) } - pub fn del_attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result<(), Error> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> { + let key = id.0.to_be_bytes(); self.0.del(key)?; Ok(()) } } -pub struct DocumentAttrsWordsIter<'a>(sled::Iter<'a>); - -impl<'a> Iterator for DocumentAttrsWordsIter<'a> { - type Item = sled::Result<(SchemaAttr, fst::Set)>; - - fn next(&mut self) -> Option { - match self.0.next() { - Some(Ok((key, bytes))) => { - let slice: &[u8] = key.as_ref(); - let array = slice.try_into().unwrap(); - let key = DocumentAttrKey::from_be_bytes(array); - - let len = bytes.len(); - let value = bytes.into(); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len).unwrap(); - let set = fst::Set::from(fst); - - Some(Ok((key.attribute, set))) - }, - Some(Err(e)) => Some(Err(e.into())), - None => None, - } - } -} - #[derive(Clone)] pub struct DocumentsIndex(Arc); @@ -384,6 +349,18 @@ impl DocumentsIndex { Ok(()) } + pub fn del_all_document_fields(&self, id: DocumentId) -> sled::Result<()> { + let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes(); + let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes(); + let document_attrs = self.0.range(start..=end).keys(); + + for key in document_attrs { + self.0.del(key?)?; + } + + Ok(()) + } + pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { let start = DocumentAttrKey::new(id, SchemaAttr::min()); let start = start.to_be_bytes(); @@ -571,7 +548,7 @@ impl<'a> DocumentsAddition<'a> { let lease_inner = self.inner.lease_inner(); let main = &lease_inner.raw.main; let words = &lease_inner.raw.words; - let attrs_words = &lease_inner.raw.attrs_words; + let docs_words = &lease_inner.raw.docs_words; let documents = &lease_inner.raw.documents; // 1. 
remove the previous documents match indexes @@ -584,10 +561,10 @@ impl<'a> DocumentsAddition<'a> { documents.set_document_field(id, attr, value)?; } - let Indexed { words_doc_indexes, docs_attrs_words } = self.indexer.build(); + let indexed = self.indexer.build(); let mut delta_words_builder = SetBuilder::memory(); - for (word, delta_set) in words_doc_indexes { + for (word, delta_set) in indexed.words_doc_indexes { delta_words_builder.insert(&word).unwrap(); let set = match words.doc_indexes(&word)? { @@ -598,8 +575,8 @@ impl<'a> DocumentsAddition<'a> { words.set_doc_indexes(&word, &set)?; } - for ((id, attr), words) in docs_attrs_words { - attrs_words.set_attr_words(id, attr, &words)?; + for (id, words) in indexed.docs_words { + docs_words.set_doc_words(id, &words)?; } let delta_words = delta_words_builder @@ -656,7 +633,7 @@ impl<'a> DocumentsDeletion<'a> { pub fn finalize(mut self) -> Result<(), Error> { let lease_inner = self.inner.lease_inner(); let main = &lease_inner.raw.main; - let attrs_words = &lease_inner.raw.attrs_words; + let docs_words = &lease_inner.raw.docs_words; let words = &lease_inner.raw.words; let documents = &lease_inner.raw.documents; @@ -666,26 +643,25 @@ impl<'a> DocumentsDeletion<'a> { SetBuf::new_unchecked(self.documents) }; - let mut words_attrs = HashMap::new(); + let mut words_document_ids = HashMap::new(); for id in idset.into_vec() { - for result in attrs_words.attrs_words(id) { - let (attr, words) = result?; + if let Some(words) = docs_words.doc_words(id)? { let mut stream = words.stream(); while let Some(word) = stream.next() { let word = word.to_vec(); - words_attrs.entry(word).or_insert_with(Vec::new).push((id, attr)); + words_document_ids.entry(word).or_insert_with(Vec::new).push(id); } } } let mut removed_words = BTreeSet::new(); - for (word, mut attrs) in words_attrs { - attrs.sort_unstable(); - attrs.dedup(); - let attrs = SetBuf::new_unchecked(attrs); + for (word, mut document_ids) in words_document_ids { + document_ids.sort_unstable(); + document_ids.dedup(); + let document_ids = SetBuf::new_unchecked(document_ids); if let Some(doc_indexes) = words.doc_indexes(&word)? 
{ - let op = DifferenceByKey::new(&doc_indexes, &attrs, |d| d.document_id, |(id, _)| *id); + let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); let doc_indexes = op.into_set_buf(); if !doc_indexes.is_empty() { @@ -696,9 +672,9 @@ impl<'a> DocumentsDeletion<'a> { } } - for (id, attr) in attrs.into_vec() { - documents.del_document_field(id, attr)?; - attrs_words.del_attr_words(id, attr)?; + for id in document_ids.into_vec() { + documents.del_all_document_fields(id)?; + docs_words.del_doc_words(id)?; } } diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index 1aa3095c8..708aac14d 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -13,12 +13,12 @@ type Word = Vec; // TODO make it be a SmallVec pub struct Indexer { word_limit: usize, // the maximum number of indexed words words_doc_indexes: BTreeMap>, - docs_attrs_words: HashMap<(DocumentId, SchemaAttr), Vec>, + docs_words: HashMap>, } pub struct Indexed { pub words_doc_indexes: BTreeMap>, - pub docs_attrs_words: HashMap<(DocumentId, SchemaAttr), fst::Set>, + pub docs_words: HashMap, } impl Indexer { @@ -30,7 +30,7 @@ impl Indexer { Indexer { word_limit: limit, words_doc_indexes: BTreeMap::new(), - docs_attrs_words: HashMap::new(), + docs_words: HashMap::new(), } } @@ -42,7 +42,7 @@ impl Indexer { attr, self.word_limit, &mut self.words_doc_indexes, - &mut self.docs_attrs_words, + &mut self.docs_words, ); if !must_continue { break } @@ -60,7 +60,7 @@ impl Indexer { attr, self.word_limit, &mut self.words_doc_indexes, - &mut self.docs_attrs_words, + &mut self.docs_words, ); if !must_continue { break } @@ -76,16 +76,16 @@ impl Indexer { (word, SetBuf::new_unchecked(indexes)) }).collect(); - let docs_attrs_words = self.docs_attrs_words + let docs_words = self.docs_words .into_iter() - .map(|((id, attr), mut words)| { + .map(|(id, mut words)| { words.sort_unstable(); words.dedup(); - ((id, attr), fst::Set::from_iter(words).unwrap()) + (id, fst::Set::from_iter(words).unwrap()) }) .collect(); - Indexed { words_doc_indexes, docs_attrs_words } + Indexed { words_doc_indexes, docs_words } } } @@ -95,7 +95,7 @@ fn index_token( attr: SchemaAttr, word_limit: usize, words_doc_indexes: &mut BTreeMap>, - docs_attrs_words: &mut HashMap<(DocumentId, SchemaAttr), Vec>, + docs_words: &mut HashMap>, ) -> bool { if token.word_index >= word_limit { return false } @@ -106,7 +106,7 @@ fn index_token( Some(docindex) => { let word = Vec::from(token.word); words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); - docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word); + docs_words.entry(id).or_insert_with(Vec::new).push(word); }, None => return false, } @@ -119,7 +119,7 @@ fn index_token( Some(docindex) => { let word = Vec::from(token.word); words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); - docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word); + docs_words.entry(id).or_insert_with(Vec::new).push(word); }, None => return false, } From 2e31bb519a2bf9bdd650277ce14f0235a2202c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 May 2019 11:36:44 +0200 Subject: [PATCH 20/32] chore: Split the database structure internal types --- meilidb-data/src/database.rs | 719 ------------------ meilidb-data/src/database/docs_words_index.rs | 33 + .../src/database/documents_addition.rs | 131 ++++ .../src/database/documents_deletion.rs | 110 +++ meilidb-data/src/database/documents_index.rs | 71 
++ meilidb-data/src/database/error.rs | 57 ++ meilidb-data/src/database/index.rs | 121 +++ meilidb-data/src/database/main_index.rs | 62 ++ meilidb-data/src/database/mod.rs | 161 ++++ meilidb-data/src/database/raw_index.rs | 9 + meilidb-data/src/database/words_index.rs | 32 + 11 files changed, 787 insertions(+), 719 deletions(-) delete mode 100644 meilidb-data/src/database.rs create mode 100644 meilidb-data/src/database/docs_words_index.rs create mode 100644 meilidb-data/src/database/documents_addition.rs create mode 100644 meilidb-data/src/database/documents_deletion.rs create mode 100644 meilidb-data/src/database/documents_index.rs create mode 100644 meilidb-data/src/database/error.rs create mode 100644 meilidb-data/src/database/index.rs create mode 100644 meilidb-data/src/database/main_index.rs create mode 100644 meilidb-data/src/database/mod.rs create mode 100644 meilidb-data/src/database/raw_index.rs create mode 100644 meilidb-data/src/database/words_index.rs diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs deleted file mode 100644 index badbc2bf8..000000000 --- a/meilidb-data/src/database.rs +++ /dev/null @@ -1,719 +0,0 @@ -use std::collections::hash_map::Entry; -use std::collections::{BTreeSet, HashSet, HashMap}; -use std::convert::TryInto; -use std::path::Path; -use std::sync::{Arc, RwLock}; -use std::{error, fmt}; - -use arc_swap::{ArcSwap, Lease}; -use meilidb_core::{criterion::Criteria, QueryBuilder, Store, DocumentId, DocIndex}; -use rmp_serde::decode::{Error as RmpError}; -use sdset::{Set, SetBuf, SetOperation, duo::{Union, DifferenceByKey}}; -use serde::de; -use sled::IVec; -use zerocopy::{AsBytes, LayoutVerified}; -use fst::{SetBuilder, set::OpBuilder, Streamer}; - -use crate::document_attr_key::DocumentAttrKey; -use crate::indexer::Indexer; -use crate::serde::extract_document_id; -use crate::serde::{Serializer, RamDocumentStore, Deserializer, SerializerError}; -use crate::{Schema, SchemaAttr, RankedMap}; - -#[derive(Debug)] -pub enum Error { - SchemaDiffer, - SchemaMissing, - WordIndexMissing, - MissingDocumentId, - SledError(sled::Error), - FstError(fst::Error), - BincodeError(bincode::Error), - SerializerError(SerializerError), -} - -impl From for Error { - fn from(error: sled::Error) -> Error { - Error::SledError(error) - } -} - -impl From for Error { - fn from(error: fst::Error) -> Error { - Error::FstError(error) - } -} - -impl From for Error { - fn from(error: bincode::Error) -> Error { - Error::BincodeError(error) - } -} - -impl From for Error { - fn from(error: SerializerError) -> Error { - Error::SerializerError(error) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Error::*; - match self { - SchemaDiffer => write!(f, "schemas differ"), - SchemaMissing => write!(f, "this index does not have a schema"), - WordIndexMissing => write!(f, "this index does not have a word index"), - MissingDocumentId => write!(f, "document id is missing"), - SledError(e) => write!(f, "sled error; {}", e), - FstError(e) => write!(f, "fst error; {}", e), - BincodeError(e) => write!(f, "bincode error; {}", e), - SerializerError(e) => write!(f, "serializer error; {}", e), - } - } -} - -impl error::Error for Error { } - -pub struct Database { - cache: RwLock>>, - inner: sled::Db, -} - -impl Database { - pub fn start_default>(path: P) -> Result { - let cache = RwLock::new(HashMap::new()); - let inner = sled::Db::start_default(path)?; - Ok(Database { cache, inner }) - } - - pub fn indexes(&self) -> Result>, 
Error> { - let bytes = match self.inner.get("indexes")? { - Some(bytes) => bytes, - None => return Ok(None), - }; - - let indexes = bincode::deserialize(&bytes)?; - Ok(Some(indexes)) - } - - pub fn set_indexes(&self, value: &HashSet) -> Result<(), Error> { - let bytes = bincode::serialize(value)?; - self.inner.set("indexes", bytes)?; - Ok(()) - } - - pub fn open_index(&self, name: &str) -> Result>, Error> { - { - let cache = self.cache.read().unwrap(); - if let Some(index) = cache.get(name).cloned() { - return Ok(Some(index)) - } - } - - let mut cache = self.cache.write().unwrap(); - let index = match cache.entry(name.to_string()) { - Entry::Occupied(occupied) => { - occupied.get().clone() - }, - Entry::Vacant(vacant) => { - if !self.indexes()?.map_or(false, |x| x.contains(name)) { - return Ok(None) - } - - let main = { - let tree = self.inner.open_tree(name)?; - MainIndex(tree) - }; - - let words = { - let tree_name = format!("{}-words", name); - let tree = self.inner.open_tree(tree_name)?; - WordsIndex(tree) - }; - - let docs_words = { - let tree_name = format!("{}-docs-words", name); - let tree = self.inner.open_tree(tree_name)?; - DocsWords(tree) - }; - - let documents = { - let tree_name = format!("{}-documents", name); - let tree = self.inner.open_tree(tree_name)?; - DocumentsIndex(tree) - }; - - let raw_index = RawIndex { main, words, docs_words, documents }; - let index = Index::from_raw(raw_index)?; - - vacant.insert(Arc::new(index)).clone() - }, - }; - - Ok(Some(index)) - } - - pub fn create_index(&self, name: &str, schema: Schema) -> Result, Error> { - let mut cache = self.cache.write().unwrap(); - - let index = match cache.entry(name.to_string()) { - Entry::Occupied(occupied) => { - occupied.get().clone() - }, - Entry::Vacant(vacant) => { - let main = { - let tree = self.inner.open_tree(name)?; - MainIndex(tree) - }; - - if let Some(prev_schema) = main.schema()? { - if prev_schema != schema { - return Err(Error::SchemaDiffer) - } - } - - main.set_schema(&schema)?; - - let words = { - let tree_name = format!("{}-words", name); - let tree = self.inner.open_tree(tree_name)?; - WordsIndex(tree) - }; - - let docs_words = { - let tree_name = format!("{}-docs-words", name); - let tree = self.inner.open_tree(tree_name)?; - DocsWords(tree) - }; - - let documents = { - let tree_name = format!("{}-documents", name); - let tree = self.inner.open_tree(tree_name)?; - DocumentsIndex(tree) - }; - - let mut indexes = self.indexes()?.unwrap_or_else(HashSet::new); - indexes.insert(name.to_string()); - self.set_indexes(&indexes)?; - - let raw_index = RawIndex { main, words, docs_words, documents }; - let index = Index::from_raw(raw_index)?; - - vacant.insert(Arc::new(index)).clone() - }, - }; - - Ok(index) - } -} - -#[derive(Clone)] -pub struct RawIndex { - pub main: MainIndex, - pub words: WordsIndex, - pub docs_words: DocsWords, - pub documents: DocumentsIndex, -} - -#[derive(Clone)] -pub struct MainIndex(Arc); - -impl MainIndex { - pub fn schema(&self) -> Result, Error> { - match self.0.get("schema")? { - Some(bytes) => { - let schema = Schema::read_from_bin(bytes.as_ref())?; - Ok(Some(schema)) - }, - None => Ok(None), - } - } - - pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> { - let mut bytes = Vec::new(); - schema.write_to_bin(&mut bytes)?; - self.0.set("schema", bytes)?; - Ok(()) - } - - pub fn words_set(&self) -> Result, Error> { - match self.0.get("words")? 
{ - Some(bytes) => { - let len = bytes.len(); - let value = bytes.into(); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; - Ok(Some(fst::Set::from(fst))) - }, - None => Ok(None), - } - } - - pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> { - self.0.set("words", value.as_fst().as_bytes())?; - Ok(()) - } - - pub fn ranked_map(&self) -> Result, Error> { - match self.0.get("ranked-map")? { - Some(bytes) => { - let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?; - Ok(Some(ranked_map)) - }, - None => Ok(None), - } - } - - pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> { - let mut bytes = Vec::new(); - value.write_to_bin(&mut bytes)?; - self.0.set("ranked_map", bytes)?; - Ok(()) - } -} - -#[derive(Clone)] -pub struct WordsIndex(Arc); - -impl WordsIndex { - pub fn doc_indexes(&self, word: &[u8]) -> sled::Result>> { - match self.0.get(word)? { - Some(bytes) => { - let layout = LayoutVerified::new_slice(bytes.as_ref()).expect("invalid layout"); - let slice = layout.into_slice(); - let setbuf = SetBuf::new_unchecked(slice.to_vec()); - Ok(Some(setbuf)) - }, - None => Ok(None), - } - } - - pub fn set_doc_indexes(&self, word: &[u8], set: &Set) -> sled::Result<()> { - self.0.set(word, set.as_bytes())?; - Ok(()) - } - - pub fn del_doc_indexes(&self, word: &[u8]) -> sled::Result<()> { - self.0.del(word)?; - Ok(()) - } -} - -#[derive(Clone)] -pub struct DocsWords(Arc); - -impl DocsWords { - pub fn doc_words(&self, id: DocumentId) -> Result, Error> { - let key = id.0.to_be_bytes(); - match self.0.get(key)? { - Some(bytes) => { - let len = bytes.len(); - let value = bytes.into(); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; - Ok(Some(fst::Set::from(fst))) - }, - None => Ok(None) - } - } - - pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> { - let key = id.0.to_be_bytes(); - self.0.set(key, words.as_fst().as_bytes())?; - Ok(()) - } - - pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> { - let key = id.0.to_be_bytes(); - self.0.del(key)?; - Ok(()) - } -} - -#[derive(Clone)] -pub struct DocumentsIndex(Arc); - -impl DocumentsIndex { - pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.get(key) - } - - pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec) -> sled::Result<()> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.set(key, value)?; - Ok(()) - } - - pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result<()> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.del(key)?; - Ok(()) - } - - pub fn del_all_document_fields(&self, id: DocumentId) -> sled::Result<()> { - let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes(); - let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes(); - let document_attrs = self.0.range(start..=end).keys(); - - for key in document_attrs { - self.0.del(key?)?; - } - - Ok(()) - } - - pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { - let start = DocumentAttrKey::new(id, SchemaAttr::min()); - let start = start.to_be_bytes(); - - let end = DocumentAttrKey::new(id, SchemaAttr::max()); - let end = end.to_be_bytes(); - - DocumentFieldsIter(self.0.range(start..=end)) - } -} - -pub struct DocumentFieldsIter<'a>(sled::Iter<'a>); - -impl<'a> Iterator for DocumentFieldsIter<'a> { - type Item = sled::Result<(SchemaAttr, IVec)>; - - fn 
next(&mut self) -> Option { - match self.0.next() { - Some(Ok((key, value))) => { - let slice: &[u8] = key.as_ref(); - let array = slice.try_into().unwrap(); - let key = DocumentAttrKey::from_be_bytes(array); - Some(Ok((key.attribute, value))) - }, - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -#[derive(Clone)] -pub struct Index(ArcSwap); - -pub struct InnerIndex { - pub words: fst::Set, - pub schema: Schema, - pub ranked_map: RankedMap, - pub raw: RawIndex, // TODO this will be a snapshot in the future -} - -impl Index { - fn from_raw(raw: RawIndex) -> Result { - let words = match raw.main.words_set()? { - Some(words) => words, - None => fst::Set::default(), - }; - - let schema = match raw.main.schema()? { - Some(schema) => schema, - None => return Err(Error::SchemaMissing), - }; - - let ranked_map = match raw.main.ranked_map()? { - Some(map) => map, - None => RankedMap::default(), - }; - - let inner = InnerIndex { words, schema, ranked_map, raw }; - let index = Index(ArcSwap::new(Arc::new(inner))); - - Ok(index) - } - - pub fn query_builder(&self) -> QueryBuilder { - let lease = IndexLease(self.0.lease()); - QueryBuilder::new(lease) - } - - pub fn query_builder_with_criteria<'c>( - &self, - criteria: Criteria<'c>, - ) -> QueryBuilder<'c, IndexLease> - { - let lease = IndexLease(self.0.lease()); - QueryBuilder::with_criteria(lease, criteria) - } - - pub fn lease_inner(&self) -> Lease> { - self.0.lease() - } - - pub fn schema(&self) -> Schema { - self.0.lease().schema.clone() - } - - pub fn documents_addition(&self) -> DocumentsAddition { - let ranked_map = self.0.lease().ranked_map.clone(); - DocumentsAddition::new(self, ranked_map) - } - - pub fn documents_deletion(&self) -> DocumentsDeletion { - DocumentsDeletion::new(self) - } - - pub fn document( - &self, - fields: Option<&HashSet<&str>>, - id: DocumentId, - ) -> Result, RmpError> - where T: de::DeserializeOwned, - { - let schema = &self.lease_inner().schema; - let fields = fields - .map(|fields| { - fields - .into_iter() - .filter_map(|name| schema.attribute(name)) - .collect() - }); - - let mut deserializer = Deserializer { - document_id: id, - index: &self, - fields: fields.as_ref(), - }; - - // TODO: currently we return an error if all document fields are missing, - // returning None would have been better - T::deserialize(&mut deserializer).map(Some) - } -} - -pub struct IndexLease(Lease>); - -impl Store for IndexLease { - type Error = Error; - - fn words(&self) -> Result<&fst::Set, Self::Error> { - Ok(&self.0.words) - } - - fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { - Ok(self.0.raw.words.doc_indexes(word)?) - } -} - -pub struct DocumentsAddition<'a> { - inner: &'a Index, - document_ids: HashSet, - document_store: RamDocumentStore, - indexer: Indexer, - ranked_map: RankedMap, -} - -impl<'a> DocumentsAddition<'a> { - fn new(inner: &'a Index, ranked_map: RankedMap) -> DocumentsAddition<'a> { - DocumentsAddition { - inner, - document_ids: HashSet::new(), - document_store: RamDocumentStore::new(), - indexer: Indexer::new(), - ranked_map, - } - } - - pub fn update_document(&mut self, document: D) -> Result<(), Error> - where D: serde::Serialize, - { - let schema = &self.inner.lease_inner().schema; - let identifier = schema.identifier_name(); - - let document_id = match extract_document_id(identifier, &document)? { - Some(id) => id, - None => return Err(Error::MissingDocumentId), - }; - - // 1. store the document id for future deletion - self.document_ids.insert(document_id); - - // 2. 
index the document fields in ram stores - let serializer = Serializer { - schema, - document_store: &mut self.document_store, - indexer: &mut self.indexer, - ranked_map: &mut self.ranked_map, - document_id, - }; - - document.serialize(serializer)?; - - Ok(()) - } - - pub fn finalize(self) -> Result<(), Error> { - let lease_inner = self.inner.lease_inner(); - let main = &lease_inner.raw.main; - let words = &lease_inner.raw.words; - let docs_words = &lease_inner.raw.docs_words; - let documents = &lease_inner.raw.documents; - - // 1. remove the previous documents match indexes - let mut documents_deletion = DocumentsDeletion::new(self.inner); - documents_deletion.extend(self.document_ids); - documents_deletion.finalize()?; - - // 2. insert new document attributes in the database - for ((id, attr), value) in self.document_store.into_inner() { - documents.set_document_field(id, attr, value)?; - } - - let indexed = self.indexer.build(); - let mut delta_words_builder = SetBuilder::memory(); - - for (word, delta_set) in indexed.words_doc_indexes { - delta_words_builder.insert(&word).unwrap(); - - let set = match words.doc_indexes(&word)? { - Some(set) => Union::new(&set, &delta_set).into_set_buf(), - None => delta_set, - }; - - words.set_doc_indexes(&word, &set)?; - } - - for (id, words) in indexed.docs_words { - docs_words.set_doc_words(id, &words)?; - } - - let delta_words = delta_words_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap(); - - let words = match main.words_set()? { - Some(words) => { - let op = OpBuilder::new() - .add(words.stream()) - .add(delta_words.stream()) - .r#union(); - - let mut words_builder = SetBuilder::memory(); - words_builder.extend_stream(op).unwrap(); - words_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap() - }, - None => delta_words, - }; - - main.set_words_set(&words)?; - main.set_ranked_map(&self.ranked_map)?; - - // update the "consistent" view of the Index - let ranked_map = self.ranked_map; - let schema = lease_inner.schema.clone(); - let raw = lease_inner.raw.clone(); - - let inner = InnerIndex { words, schema, ranked_map, raw }; - self.inner.0.store(Arc::new(inner)); - - Ok(()) - } -} - -pub struct DocumentsDeletion<'a> { - inner: &'a Index, - documents: Vec, -} - -impl<'a> DocumentsDeletion<'a> { - fn new(inner: &'a Index) -> DocumentsDeletion { - DocumentsDeletion { inner, documents: Vec::new() } - } - - pub fn delete_document(&mut self, id: DocumentId) { - self.documents.push(id); - } - - pub fn finalize(mut self) -> Result<(), Error> { - let lease_inner = self.inner.lease_inner(); - let main = &lease_inner.raw.main; - let docs_words = &lease_inner.raw.docs_words; - let words = &lease_inner.raw.words; - let documents = &lease_inner.raw.documents; - - let idset = { - self.documents.sort_unstable(); - self.documents.dedup(); - SetBuf::new_unchecked(self.documents) - }; - - let mut words_document_ids = HashMap::new(); - for id in idset.into_vec() { - if let Some(words) = docs_words.doc_words(id)? { - let mut stream = words.stream(); - while let Some(word) = stream.next() { - let word = word.to_vec(); - words_document_ids.entry(word).or_insert_with(Vec::new).push(id); - } - } - } - - let mut removed_words = BTreeSet::new(); - for (word, mut document_ids) in words_document_ids { - document_ids.sort_unstable(); - document_ids.dedup(); - let document_ids = SetBuf::new_unchecked(document_ids); - - if let Some(doc_indexes) = words.doc_indexes(&word)? 
{ - let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); - let doc_indexes = op.into_set_buf(); - - if !doc_indexes.is_empty() { - words.set_doc_indexes(&word, &doc_indexes)?; - } else { - words.del_doc_indexes(&word)?; - removed_words.insert(word); - } - } - - for id in document_ids.into_vec() { - documents.del_all_document_fields(id)?; - docs_words.del_doc_words(id)?; - } - } - - let removed_words = fst::Set::from_iter(removed_words).unwrap(); - let words = match main.words_set()? { - Some(words_set) => { - let op = fst::set::OpBuilder::new() - .add(words_set.stream()) - .add(removed_words.stream()) - .difference(); - - let mut words_builder = SetBuilder::memory(); - words_builder.extend_stream(op).unwrap(); - words_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap() - }, - None => fst::Set::default(), - }; - - main.set_words_set(&words)?; - - // TODO must update the ranked_map too! - - // update the "consistent" view of the Index - let ranked_map = lease_inner.ranked_map.clone(); - let schema = lease_inner.schema.clone(); - let raw = lease_inner.raw.clone(); - - let inner = InnerIndex { words, schema, ranked_map, raw }; - self.inner.0.store(Arc::new(inner)); - - Ok(()) - } -} - -impl<'a> Extend for DocumentsDeletion<'a> { - fn extend>(&mut self, iter: T) { - self.documents.extend(iter) - } -} diff --git a/meilidb-data/src/database/docs_words_index.rs b/meilidb-data/src/database/docs_words_index.rs new file mode 100644 index 000000000..6b7de15a2 --- /dev/null +++ b/meilidb-data/src/database/docs_words_index.rs @@ -0,0 +1,33 @@ +use std::sync::Arc; +use meilidb_core::DocumentId; +use super::Error; + +#[derive(Clone)] +pub struct DocsWordsIndex(pub Arc); + +impl DocsWordsIndex { + pub fn doc_words(&self, id: DocumentId) -> Result, Error> { + let key = id.0.to_be_bytes(); + match self.0.get(key)? 
{ + Some(bytes) => { + let len = bytes.len(); + let value = bytes.into(); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + None => Ok(None) + } + } + + pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> { + let key = id.0.to_be_bytes(); + self.0.set(key, words.as_fst().as_bytes())?; + Ok(()) + } + + pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> { + let key = id.0.to_be_bytes(); + self.0.del(key)?; + Ok(()) + } +} diff --git a/meilidb-data/src/database/documents_addition.rs b/meilidb-data/src/database/documents_addition.rs new file mode 100644 index 000000000..b22c06da6 --- /dev/null +++ b/meilidb-data/src/database/documents_addition.rs @@ -0,0 +1,131 @@ +use std::collections::HashSet; +use std::sync::Arc; + +use meilidb_core::DocumentId; +use fst::{SetBuilder, set::OpBuilder}; +use sdset::{SetOperation, duo::Union}; + +use crate::indexer::Indexer; +use crate::serde::{extract_document_id, Serializer, RamDocumentStore}; +use crate::RankedMap; + +use super::{Error, Index, InnerIndex, DocumentsDeletion}; + +pub struct DocumentsAddition<'a> { + inner: &'a Index, + document_ids: HashSet, + document_store: RamDocumentStore, + indexer: Indexer, + ranked_map: RankedMap, +} + +impl<'a> DocumentsAddition<'a> { + pub(crate) fn new(inner: &'a Index, ranked_map: RankedMap) -> DocumentsAddition<'a> { + DocumentsAddition { + inner, + document_ids: HashSet::new(), + document_store: RamDocumentStore::new(), + indexer: Indexer::new(), + ranked_map, + } + } + + pub fn update_document(&mut self, document: D) -> Result<(), Error> + where D: serde::Serialize, + { + let schema = &self.inner.lease_inner().schema; + let identifier = schema.identifier_name(); + + let document_id = match extract_document_id(identifier, &document)? { + Some(id) => id, + None => return Err(Error::MissingDocumentId), + }; + + // 1. store the document id for future deletion + self.document_ids.insert(document_id); + + // 2. index the document fields in ram stores + let serializer = Serializer { + schema, + document_store: &mut self.document_store, + indexer: &mut self.indexer, + ranked_map: &mut self.ranked_map, + document_id, + }; + + document.serialize(serializer)?; + + Ok(()) + } + + pub fn finalize(self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let main = &lease_inner.raw.main; + let words = &lease_inner.raw.words; + let docs_words = &lease_inner.raw.docs_words; + let documents = &lease_inner.raw.documents; + + // 1. remove the previous documents match indexes + let mut documents_deletion = DocumentsDeletion::new(self.inner); + documents_deletion.extend(self.document_ids); + documents_deletion.finalize()?; + + // 2. insert new document attributes in the database + for ((id, attr), value) in self.document_store.into_inner() { + documents.set_document_field(id, attr, value)?; + } + + let indexed = self.indexer.build(); + let mut delta_words_builder = SetBuilder::memory(); + + for (word, delta_set) in indexed.words_doc_indexes { + delta_words_builder.insert(&word).unwrap(); + + let set = match words.doc_indexes(&word)? { + Some(set) => Union::new(&set, &delta_set).into_set_buf(), + None => delta_set, + }; + + words.set_doc_indexes(&word, &set)?; + } + + for (id, words) in indexed.docs_words { + docs_words.set_doc_words(id, &words)?; + } + + let delta_words = delta_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let words = match main.words_set()? 
{ + Some(words) => { + let op = OpBuilder::new() + .add(words.stream()) + .add(delta_words.stream()) + .r#union(); + + let mut words_builder = SetBuilder::memory(); + words_builder.extend_stream(op).unwrap(); + words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => delta_words, + }; + + main.set_words_set(&words)?; + main.set_ranked_map(&self.ranked_map)?; + + // update the "consistent" view of the Index + let ranked_map = self.ranked_map; + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + + let inner = InnerIndex { words, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) + } +} diff --git a/meilidb-data/src/database/documents_deletion.rs b/meilidb-data/src/database/documents_deletion.rs new file mode 100644 index 000000000..5729e3461 --- /dev/null +++ b/meilidb-data/src/database/documents_deletion.rs @@ -0,0 +1,110 @@ +use std::collections::{HashMap, BTreeSet}; +use std::sync::Arc; + +use sdset::{SetBuf, SetOperation, duo::DifferenceByKey}; +use fst::{SetBuilder, Streamer}; +use meilidb_core::DocumentId; + +use super::{Index, Error, InnerIndex}; + +pub struct DocumentsDeletion<'a> { + inner: &'a Index, + documents: Vec, +} + +impl<'a> DocumentsDeletion<'a> { + pub(crate) fn new(inner: &'a Index) -> DocumentsDeletion { + DocumentsDeletion { inner, documents: Vec::new() } + } + + pub fn delete_document(&mut self, id: DocumentId) { + self.documents.push(id); + } + + pub fn finalize(mut self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let main = &lease_inner.raw.main; + let docs_words = &lease_inner.raw.docs_words; + let words = &lease_inner.raw.words; + let documents = &lease_inner.raw.documents; + + let idset = { + self.documents.sort_unstable(); + self.documents.dedup(); + SetBuf::new_unchecked(self.documents) + }; + + let mut words_document_ids = HashMap::new(); + for id in idset.into_vec() { + if let Some(words) = docs_words.doc_words(id)? { + let mut stream = words.stream(); + while let Some(word) = stream.next() { + let word = word.to_vec(); + words_document_ids.entry(word).or_insert_with(Vec::new).push(id); + } + } + } + + let mut removed_words = BTreeSet::new(); + for (word, mut document_ids) in words_document_ids { + document_ids.sort_unstable(); + document_ids.dedup(); + let document_ids = SetBuf::new_unchecked(document_ids); + + if let Some(doc_indexes) = words.doc_indexes(&word)? { + let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); + let doc_indexes = op.into_set_buf(); + + if !doc_indexes.is_empty() { + words.set_doc_indexes(&word, &doc_indexes)?; + } else { + words.del_doc_indexes(&word)?; + removed_words.insert(word); + } + } + + for id in document_ids.into_vec() { + documents.del_all_document_fields(id)?; + docs_words.del_doc_words(id)?; + } + } + + let removed_words = fst::Set::from_iter(removed_words).unwrap(); + let words = match main.words_set()? { + Some(words_set) => { + let op = fst::set::OpBuilder::new() + .add(words_set.stream()) + .add(removed_words.stream()) + .difference(); + + let mut words_builder = SetBuilder::memory(); + words_builder.extend_stream(op).unwrap(); + words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => fst::Set::default(), + }; + + main.set_words_set(&words)?; + + // TODO must update the ranked_map too! 
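
A possible shape for that TODO, sketched under two assumptions the patch does not make:
the deleted ids are still at hand at this point (the loop above consumes them), and
RankedMap exposes a remove method keyed by document id and attribute:

    // Hedged sketch only: `deleted_ids` and `RankedMap::remove` are assumptions.
    let mut ranked_map = lease_inner.ranked_map.clone();
    for id in &deleted_ids {
        for attr in 0..=SchemaAttr::max().0 { // assumes SchemaAttr wraps a u16
            ranked_map.remove(*id, SchemaAttr(attr));
        }
    }
    main.set_ranked_map(&ranked_map)?;

The updated map would then replace the plain clone used just below when the new
InnerIndex view is published.
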
+ + // update the "consistent" view of the Index + let ranked_map = lease_inner.ranked_map.clone(); + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + + let inner = InnerIndex { words, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) + } +} + +impl<'a> Extend for DocumentsDeletion<'a> { + fn extend>(&mut self, iter: T) { + self.documents.extend(iter) + } +} diff --git a/meilidb-data/src/database/documents_index.rs b/meilidb-data/src/database/documents_index.rs new file mode 100644 index 000000000..36866a638 --- /dev/null +++ b/meilidb-data/src/database/documents_index.rs @@ -0,0 +1,71 @@ +use std::sync::Arc; +use std::convert::TryInto; + +use meilidb_core::DocumentId; +use sled::IVec; + +use crate::document_attr_key::DocumentAttrKey; +use crate::schema::SchemaAttr; + +#[derive(Clone)] +pub struct DocumentsIndex(pub(crate) Arc); + +impl DocumentsIndex { + pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.get(key) + } + + pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec) -> sled::Result<()> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.set(key, value)?; + Ok(()) + } + + pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result<()> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.del(key)?; + Ok(()) + } + + pub fn del_all_document_fields(&self, id: DocumentId) -> sled::Result<()> { + let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes(); + let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes(); + let document_attrs = self.0.range(start..=end).keys(); + + for key in document_attrs { + self.0.del(key?)?; + } + + Ok(()) + } + + pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { + let start = DocumentAttrKey::new(id, SchemaAttr::min()); + let start = start.to_be_bytes(); + + let end = DocumentAttrKey::new(id, SchemaAttr::max()); + let end = end.to_be_bytes(); + + DocumentFieldsIter(self.0.range(start..=end)) + } +} + +pub struct DocumentFieldsIter<'a>(sled::Iter<'a>); + +impl<'a> Iterator for DocumentFieldsIter<'a> { + type Item = sled::Result<(SchemaAttr, IVec)>; + + fn next(&mut self) -> Option { + match self.0.next() { + Some(Ok((key, value))) => { + let slice: &[u8] = key.as_ref(); + let array = slice.try_into().unwrap(); + let key = DocumentAttrKey::from_be_bytes(array); + Some(Ok((key.attribute, value))) + }, + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} diff --git a/meilidb-data/src/database/error.rs b/meilidb-data/src/database/error.rs new file mode 100644 index 000000000..3e1b48235 --- /dev/null +++ b/meilidb-data/src/database/error.rs @@ -0,0 +1,57 @@ +use std::{error, fmt}; +use crate::serde::SerializerError; + +#[derive(Debug)] +pub enum Error { + SchemaDiffer, + SchemaMissing, + WordIndexMissing, + MissingDocumentId, + SledError(sled::Error), + FstError(fst::Error), + BincodeError(bincode::Error), + SerializerError(SerializerError), +} + +impl From for Error { + fn from(error: sled::Error) -> Error { + Error::SledError(error) + } +} + +impl From for Error { + fn from(error: fst::Error) -> Error { + Error::FstError(error) + } +} + +impl From for Error { + fn from(error: bincode::Error) -> Error { + Error::BincodeError(error) + } +} + +impl From for Error { + fn from(error: SerializerError) -> Error { + Error::SerializerError(error) + } +} + +impl fmt::Display for Error { + 
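    // The From conversions above are what let the rest of the crate apply the
    // `?` operator to sled, fst, bincode and serializer results while surfacing
    // a single meilidb-data Error; this impl renders each wrapped source with a
    // short prefix.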
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + SchemaDiffer => write!(f, "schemas differ"), + SchemaMissing => write!(f, "this index does not have a schema"), + WordIndexMissing => write!(f, "this index does not have a word index"), + MissingDocumentId => write!(f, "document id is missing"), + SledError(e) => write!(f, "sled error; {}", e), + FstError(e) => write!(f, "fst error; {}", e), + BincodeError(e) => write!(f, "bincode error; {}", e), + SerializerError(e) => write!(f, "serializer error; {}", e), + } + } +} + +impl error::Error for Error { } + diff --git a/meilidb-data/src/database/index.rs b/meilidb-data/src/database/index.rs new file mode 100644 index 000000000..d4ed26636 --- /dev/null +++ b/meilidb-data/src/database/index.rs @@ -0,0 +1,121 @@ +use sdset::SetBuf; +use std::collections::HashSet; +use std::sync::Arc; + +use arc_swap::{ArcSwap, Lease}; +use meilidb_core::criterion::Criteria; +use meilidb_core::{DocIndex, Store, DocumentId, QueryBuilder}; +use rmp_serde::decode::Error as RmpError; +use serde::de; + +use crate::ranked_map::RankedMap; +use crate::schema::Schema; +use crate::serde::Deserializer; + +use super::{Error, RawIndex, DocumentsAddition, DocumentsDeletion}; + +#[derive(Clone)] +pub struct Index(pub(crate) ArcSwap); + +pub struct InnerIndex { + pub words: fst::Set, + pub schema: Schema, + pub ranked_map: RankedMap, + pub raw: RawIndex, // TODO this will be a snapshot in the future +} + +impl Index { + pub(crate) fn from_raw(raw: RawIndex) -> Result { + let words = match raw.main.words_set()? { + Some(words) => words, + None => fst::Set::default(), + }; + + let schema = match raw.main.schema()? { + Some(schema) => schema, + None => return Err(Error::SchemaMissing), + }; + + let ranked_map = match raw.main.ranked_map()? 
{ + Some(map) => map, + None => RankedMap::default(), + }; + + let inner = InnerIndex { words, schema, ranked_map, raw }; + let index = Index(ArcSwap::new(Arc::new(inner))); + + Ok(index) + } + + pub fn query_builder(&self) -> QueryBuilder { + let lease = IndexLease(self.0.lease()); + QueryBuilder::new(lease) + } + + pub fn query_builder_with_criteria<'c>( + &self, + criteria: Criteria<'c>, + ) -> QueryBuilder<'c, IndexLease> + { + let lease = IndexLease(self.0.lease()); + QueryBuilder::with_criteria(lease, criteria) + } + + pub fn lease_inner(&self) -> Lease> { + self.0.lease() + } + + pub fn schema(&self) -> Schema { + self.0.lease().schema.clone() + } + + pub fn documents_addition(&self) -> DocumentsAddition { + let ranked_map = self.0.lease().ranked_map.clone(); + DocumentsAddition::new(self, ranked_map) + } + + pub fn documents_deletion(&self) -> DocumentsDeletion { + DocumentsDeletion::new(self) + } + + pub fn document( + &self, + fields: Option<&HashSet<&str>>, + id: DocumentId, + ) -> Result, RmpError> + where T: de::DeserializeOwned, + { + let schema = &self.lease_inner().schema; + let fields = fields + .map(|fields| { + fields + .into_iter() + .filter_map(|name| schema.attribute(name)) + .collect() + }); + + let mut deserializer = Deserializer { + document_id: id, + index: &self, + fields: fields.as_ref(), + }; + + // TODO: currently we return an error if all document fields are missing, + // returning None would have been better + T::deserialize(&mut deserializer).map(Some) + } +} + +pub struct IndexLease(Lease>); + +impl Store for IndexLease { + type Error = Error; + + fn words(&self) -> Result<&fst::Set, Self::Error> { + Ok(&self.0.words) + } + + fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { + Ok(self.0.raw.words.doc_indexes(word)?) + } +} diff --git a/meilidb-data/src/database/main_index.rs b/meilidb-data/src/database/main_index.rs new file mode 100644 index 000000000..f8d389774 --- /dev/null +++ b/meilidb-data/src/database/main_index.rs @@ -0,0 +1,62 @@ +use std::sync::Arc; + +use crate::ranked_map::RankedMap; +use crate::schema::Schema; + +use super::Error; + +#[derive(Clone)] +pub struct MainIndex(pub(crate) Arc); + +impl MainIndex { + pub fn schema(&self) -> Result, Error> { + match self.0.get("schema")? { + Some(bytes) => { + let schema = Schema::read_from_bin(bytes.as_ref())?; + Ok(Some(schema)) + }, + None => Ok(None), + } + } + + pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> { + let mut bytes = Vec::new(); + schema.write_to_bin(&mut bytes)?; + self.0.set("schema", bytes)?; + Ok(()) + } + + pub fn words_set(&self) -> Result, Error> { + match self.0.get("words")? { + Some(bytes) => { + let len = bytes.len(); + let value = bytes.into(); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + None => Ok(None), + } + } + + pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> { + self.0.set("words", value.as_fst().as_bytes())?; + Ok(()) + } + + pub fn ranked_map(&self) -> Result, Error> { + match self.0.get("ranked-map")? 
{ + Some(bytes) => { + let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?; + Ok(Some(ranked_map)) + }, + None => Ok(None), + } + } + + pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> { + let mut bytes = Vec::new(); + value.write_to_bin(&mut bytes)?; + self.0.set("ranked_map", bytes)?; + Ok(()) + } +} diff --git a/meilidb-data/src/database/mod.rs b/meilidb-data/src/database/mod.rs new file mode 100644 index 000000000..5f415fb1e --- /dev/null +++ b/meilidb-data/src/database/mod.rs @@ -0,0 +1,161 @@ +use std::collections::hash_map::Entry; +use std::collections::{HashSet, HashMap}; +use std::path::Path; +use std::sync::{Arc, RwLock}; + +use crate::Schema; + +mod docs_words_index; +mod documents_addition; +mod documents_deletion; +mod documents_index; +mod error; +mod index; +mod main_index; +mod raw_index; +mod words_index; + +pub use self::error::Error; +pub use self::index::Index; + +use self::docs_words_index::DocsWordsIndex; +use self::documents_addition::DocumentsAddition; +use self::documents_deletion::DocumentsDeletion; +use self::documents_index::DocumentsIndex; +use self::index::InnerIndex; +use self::main_index::MainIndex; +use self::raw_index::RawIndex; +use self::words_index::WordsIndex; + +pub struct Database { + cache: RwLock>>, + inner: sled::Db, +} + +impl Database { + pub fn start_default>(path: P) -> Result { + let cache = RwLock::new(HashMap::new()); + let inner = sled::Db::start_default(path)?; + Ok(Database { cache, inner }) + } + + pub fn indexes(&self) -> Result>, Error> { + let bytes = match self.inner.get("indexes")? { + Some(bytes) => bytes, + None => return Ok(None), + }; + + let indexes = bincode::deserialize(&bytes)?; + Ok(Some(indexes)) + } + + fn set_indexes(&self, value: &HashSet) -> Result<(), Error> { + let bytes = bincode::serialize(value)?; + self.inner.set("indexes", bytes)?; + Ok(()) + } + + pub fn open_index(&self, name: &str) -> Result>, Error> { + { + let cache = self.cache.read().unwrap(); + if let Some(index) = cache.get(name).cloned() { + return Ok(Some(index)) + } + } + + let mut cache = self.cache.write().unwrap(); + let index = match cache.entry(name.to_string()) { + Entry::Occupied(occupied) => { + occupied.get().clone() + }, + Entry::Vacant(vacant) => { + if !self.indexes()?.map_or(false, |x| x.contains(name)) { + return Ok(None) + } + + let main = { + let tree = self.inner.open_tree(name)?; + MainIndex(tree) + }; + + let words = { + let tree_name = format!("{}-words", name); + let tree = self.inner.open_tree(tree_name)?; + WordsIndex(tree) + }; + + let docs_words = { + let tree_name = format!("{}-docs-words", name); + let tree = self.inner.open_tree(tree_name)?; + DocsWordsIndex(tree) + }; + + let documents = { + let tree_name = format!("{}-documents", name); + let tree = self.inner.open_tree(tree_name)?; + DocumentsIndex(tree) + }; + + let raw_index = RawIndex { main, words, docs_words, documents }; + let index = Index::from_raw(raw_index)?; + + vacant.insert(Arc::new(index)).clone() + }, + }; + + Ok(Some(index)) + } + + pub fn create_index(&self, name: &str, schema: Schema) -> Result, Error> { + let mut cache = self.cache.write().unwrap(); + + let index = match cache.entry(name.to_string()) { + Entry::Occupied(occupied) => { + occupied.get().clone() + }, + Entry::Vacant(vacant) => { + let main = { + let tree = self.inner.open_tree(name)?; + MainIndex(tree) + }; + + if let Some(prev_schema) = main.schema()? 
{ + if prev_schema != schema { + return Err(Error::SchemaDiffer) + } + } + + main.set_schema(&schema)?; + + let words = { + let tree_name = format!("{}-words", name); + let tree = self.inner.open_tree(tree_name)?; + WordsIndex(tree) + }; + + let docs_words = { + let tree_name = format!("{}-docs-words", name); + let tree = self.inner.open_tree(tree_name)?; + DocsWordsIndex(tree) + }; + + let documents = { + let tree_name = format!("{}-documents", name); + let tree = self.inner.open_tree(tree_name)?; + DocumentsIndex(tree) + }; + + let mut indexes = self.indexes()?.unwrap_or_else(HashSet::new); + indexes.insert(name.to_string()); + self.set_indexes(&indexes)?; + + let raw_index = RawIndex { main, words, docs_words, documents }; + let index = Index::from_raw(raw_index)?; + + vacant.insert(Arc::new(index)).clone() + }, + }; + + Ok(index) + } +} diff --git a/meilidb-data/src/database/raw_index.rs b/meilidb-data/src/database/raw_index.rs new file mode 100644 index 000000000..0b2a56dbd --- /dev/null +++ b/meilidb-data/src/database/raw_index.rs @@ -0,0 +1,9 @@ +use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex}; + +#[derive(Clone)] +pub struct RawIndex { + pub main: MainIndex, + pub words: WordsIndex, + pub docs_words: DocsWordsIndex, + pub documents: DocumentsIndex, +} diff --git a/meilidb-data/src/database/words_index.rs b/meilidb-data/src/database/words_index.rs new file mode 100644 index 000000000..398e73900 --- /dev/null +++ b/meilidb-data/src/database/words_index.rs @@ -0,0 +1,32 @@ +use std::sync::Arc; + +use meilidb_core::DocIndex; +use sdset::{Set, SetBuf}; +use zerocopy::{LayoutVerified, AsBytes}; + +#[derive(Clone)] +pub struct WordsIndex(pub(crate) Arc); + +impl WordsIndex { + pub fn doc_indexes(&self, word: &[u8]) -> sled::Result>> { + match self.0.get(word)? 
{ + Some(bytes) => { + let layout = LayoutVerified::new_slice(bytes.as_ref()).expect("invalid layout"); + let slice = layout.into_slice(); + let setbuf = SetBuf::new_unchecked(slice.to_vec()); + Ok(Some(setbuf)) + }, + None => Ok(None), + } + } + + pub fn set_doc_indexes(&self, word: &[u8], set: &Set) -> sled::Result<()> { + self.0.set(word, set.as_bytes())?; + Ok(()) + } + + pub fn del_doc_indexes(&self, word: &[u8]) -> sled::Result<()> { + self.0.del(word)?; + Ok(()) + } +} From c5ba34d0b0643a5fa413174318e7fa47a74d62a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 May 2019 11:39:29 +0200 Subject: [PATCH 21/32] chore: Replace crate only public interface to be completely public --- meilidb-data/src/database/documents_addition.rs | 2 +- meilidb-data/src/database/documents_deletion.rs | 2 +- meilidb-data/src/database/documents_index.rs | 2 +- meilidb-data/src/database/index.rs | 4 ++-- meilidb-data/src/database/main_index.rs | 2 +- meilidb-data/src/database/words_index.rs | 2 +- meilidb-data/src/schema.rs | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/meilidb-data/src/database/documents_addition.rs b/meilidb-data/src/database/documents_addition.rs index b22c06da6..a65334eb2 100644 --- a/meilidb-data/src/database/documents_addition.rs +++ b/meilidb-data/src/database/documents_addition.rs @@ -20,7 +20,7 @@ pub struct DocumentsAddition<'a> { } impl<'a> DocumentsAddition<'a> { - pub(crate) fn new(inner: &'a Index, ranked_map: RankedMap) -> DocumentsAddition<'a> { + pub fn new(inner: &'a Index, ranked_map: RankedMap) -> DocumentsAddition<'a> { DocumentsAddition { inner, document_ids: HashSet::new(), diff --git a/meilidb-data/src/database/documents_deletion.rs b/meilidb-data/src/database/documents_deletion.rs index 5729e3461..775b8d7f6 100644 --- a/meilidb-data/src/database/documents_deletion.rs +++ b/meilidb-data/src/database/documents_deletion.rs @@ -13,7 +13,7 @@ pub struct DocumentsDeletion<'a> { } impl<'a> DocumentsDeletion<'a> { - pub(crate) fn new(inner: &'a Index) -> DocumentsDeletion { + pub fn new(inner: &'a Index) -> DocumentsDeletion { DocumentsDeletion { inner, documents: Vec::new() } } diff --git a/meilidb-data/src/database/documents_index.rs b/meilidb-data/src/database/documents_index.rs index 36866a638..5fd276bb3 100644 --- a/meilidb-data/src/database/documents_index.rs +++ b/meilidb-data/src/database/documents_index.rs @@ -8,7 +8,7 @@ use crate::document_attr_key::DocumentAttrKey; use crate::schema::SchemaAttr; #[derive(Clone)] -pub struct DocumentsIndex(pub(crate) Arc); +pub struct DocumentsIndex(pub Arc); impl DocumentsIndex { pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result> { diff --git a/meilidb-data/src/database/index.rs b/meilidb-data/src/database/index.rs index d4ed26636..c534cd33d 100644 --- a/meilidb-data/src/database/index.rs +++ b/meilidb-data/src/database/index.rs @@ -15,7 +15,7 @@ use crate::serde::Deserializer; use super::{Error, RawIndex, DocumentsAddition, DocumentsDeletion}; #[derive(Clone)] -pub struct Index(pub(crate) ArcSwap); +pub struct Index(pub ArcSwap); pub struct InnerIndex { pub words: fst::Set, @@ -25,7 +25,7 @@ pub struct InnerIndex { } impl Index { - pub(crate) fn from_raw(raw: RawIndex) -> Result { + pub fn from_raw(raw: RawIndex) -> Result { let words = match raw.main.words_set()? 
{ Some(words) => words, None => fst::Set::default(), diff --git a/meilidb-data/src/database/main_index.rs b/meilidb-data/src/database/main_index.rs index f8d389774..b1d8edc81 100644 --- a/meilidb-data/src/database/main_index.rs +++ b/meilidb-data/src/database/main_index.rs @@ -6,7 +6,7 @@ use crate::schema::Schema; use super::Error; #[derive(Clone)] -pub struct MainIndex(pub(crate) Arc); +pub struct MainIndex(pub Arc); impl MainIndex { pub fn schema(&self) -> Result, Error> { diff --git a/meilidb-data/src/database/words_index.rs b/meilidb-data/src/database/words_index.rs index 398e73900..3b2598186 100644 --- a/meilidb-data/src/database/words_index.rs +++ b/meilidb-data/src/database/words_index.rs @@ -5,7 +5,7 @@ use sdset::{Set, SetBuf}; use zerocopy::{LayoutVerified, AsBytes}; #[derive(Clone)] -pub struct WordsIndex(pub(crate) Arc); +pub struct WordsIndex(pub Arc); impl WordsIndex { pub fn doc_indexes(&self, word: &[u8]) -> sled::Result>> { diff --git a/meilidb-data/src/schema.rs b/meilidb-data/src/schema.rs index cc39cf0e8..bd1c853dc 100644 --- a/meilidb-data/src/schema.rs +++ b/meilidb-data/src/schema.rs @@ -134,12 +134,12 @@ impl Schema { Ok(()) } - pub(crate) fn read_from_bin(reader: R) -> bincode::Result { + pub fn read_from_bin(reader: R) -> bincode::Result { let builder: SchemaBuilder = bincode::deserialize_from(reader)?; Ok(builder.build()) } - pub(crate) fn write_to_bin(&self, writer: W) -> bincode::Result<()> { + pub fn write_to_bin(&self, writer: W) -> bincode::Result<()> { let identifier = self.inner.identifier.clone(); let attributes = self.attributes_ordered(); let builder = SchemaBuilder { identifier, attributes }; From 830d2f28b9ee74305d6f5034680b793bf6f26e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 May 2019 12:01:08 +0200 Subject: [PATCH 22/32] feat: Introduce a custom tree for user custom settings --- meilidb-data/src/database/custom_settings.rs | 13 +++++++++++++ meilidb-data/src/database/index.rs | 7 ++++++- meilidb-data/src/database/mod.rs | 18 ++++++++++++++++-- meilidb-data/src/database/raw_index.rs | 3 ++- meilidb-data/src/lib.rs | 2 +- 5 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 meilidb-data/src/database/custom_settings.rs diff --git a/meilidb-data/src/database/custom_settings.rs b/meilidb-data/src/database/custom_settings.rs new file mode 100644 index 000000000..565151aaa --- /dev/null +++ b/meilidb-data/src/database/custom_settings.rs @@ -0,0 +1,13 @@ +use std::sync::Arc; +use std::ops::Deref; + +#[derive(Clone)] +pub struct CustomSettings(pub Arc); + +impl Deref for CustomSettings { + type Target = sled::Tree; + + fn deref(&self) -> &sled::Tree { + &self.0 + } +} diff --git a/meilidb-data/src/database/index.rs b/meilidb-data/src/database/index.rs index c534cd33d..2b981f5f1 100644 --- a/meilidb-data/src/database/index.rs +++ b/meilidb-data/src/database/index.rs @@ -12,7 +12,8 @@ use crate::ranked_map::RankedMap; use crate::schema::Schema; use crate::serde::Deserializer; -use super::{Error, RawIndex, DocumentsAddition, DocumentsDeletion}; +use super::{Error, CustomSettings}; +use super::{RawIndex, DocumentsAddition, DocumentsDeletion}; #[derive(Clone)] pub struct Index(pub ArcSwap); @@ -69,6 +70,10 @@ impl Index { self.0.lease().schema.clone() } + pub fn custom_settings(&self) -> CustomSettings { + self.0.lease().raw.custom.clone() + } + pub fn documents_addition(&self) -> DocumentsAddition { let ranked_map = self.0.lease().ranked_map.clone(); DocumentsAddition::new(self, ranked_map) diff --git 
a/meilidb-data/src/database/mod.rs b/meilidb-data/src/database/mod.rs index 5f415fb1e..dcdc13bd2 100644 --- a/meilidb-data/src/database/mod.rs +++ b/meilidb-data/src/database/mod.rs @@ -5,6 +5,7 @@ use std::sync::{Arc, RwLock}; use crate::Schema; +mod custom_settings; mod docs_words_index; mod documents_addition; mod documents_deletion; @@ -17,6 +18,7 @@ mod words_index; pub use self::error::Error; pub use self::index::Index; +pub use self::custom_settings::CustomSettings; use self::docs_words_index::DocsWordsIndex; use self::documents_addition::DocumentsAddition; @@ -96,7 +98,13 @@ impl Database { DocumentsIndex(tree) }; - let raw_index = RawIndex { main, words, docs_words, documents }; + let custom = { + let tree_name = format!("{}-custom", name); + let tree = self.inner.open_tree(tree_name)?; + CustomSettings(tree) + }; + + let raw_index = RawIndex { main, words, docs_words, documents, custom }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -145,11 +153,17 @@ impl Database { DocumentsIndex(tree) }; + let custom = { + let tree_name = format!("{}-custom", name); + let tree = self.inner.open_tree(tree_name)?; + CustomSettings(tree) + }; + let mut indexes = self.indexes()?.unwrap_or_else(HashSet::new); indexes.insert(name.to_string()); self.set_indexes(&indexes)?; - let raw_index = RawIndex { main, words, docs_words, documents }; + let raw_index = RawIndex { main, words, docs_words, documents, custom }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() diff --git a/meilidb-data/src/database/raw_index.rs b/meilidb-data/src/database/raw_index.rs index 0b2a56dbd..ada0fd357 100644 --- a/meilidb-data/src/database/raw_index.rs +++ b/meilidb-data/src/database/raw_index.rs @@ -1,4 +1,4 @@ -use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex}; +use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings}; #[derive(Clone)] pub struct RawIndex { @@ -6,4 +6,5 @@ pub struct RawIndex { pub words: WordsIndex, pub docs_words: DocsWordsIndex, pub documents: DocumentsIndex, + pub custom: CustomSettings, } diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index 149a65fa8..22d58fe08 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -6,7 +6,7 @@ mod ranked_map; mod serde; pub mod schema; -pub use self::database::{Database, Index}; +pub use self::database::{Database, Index, CustomSettings}; pub use self::number::Number; pub use self::ranked_map::RankedMap; pub use self::schema::{Schema, SchemaAttr}; From 4e80378a7793618702e470a0c56d2494b9f68414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 May 2019 15:45:08 +0200 Subject: [PATCH 23/32] chore: Rename the ebay example into kaggle --- examples/ebay/kaggle.csv | 122 ----------------------------- examples/kaggle/schema-kaggle.toml | 19 +++++ 2 files changed, 19 insertions(+), 122 deletions(-) delete mode 100644 examples/ebay/kaggle.csv create mode 100644 examples/kaggle/schema-kaggle.toml diff --git a/examples/ebay/kaggle.csv b/examples/ebay/kaggle.csv deleted file mode 100644 index 6bf7a70d3..000000000 --- a/examples/ebay/kaggle.csv +++ /dev/null @@ -1,122 +0,0 @@ -id,title,description,image -711158459,Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. 
Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/mzvzEUIknaQclZ801YCY1ew.jpg -711158460,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs3.ebaystatic.com/d/l225/m/mJNDmSyIS3vUasKIJEBy4Cw.jpg -711158461,Sony PlayStation 4 PS4 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs4.ebaystatic.com/d/l225/m/m10NZXArmiIkpkTDDkAUVvA.jpg -711158462,Sony - PlayStation 4 500GB The Last of Us Remastered Bundle - Black,,http://thumbs2.ebaystatic.com/d/l225/m/mZZXTmAE8WZDH1l_E_PPAkg.jpg -711158463,Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs3.ebaystatic.com/d/l225/m/mzvzEUIknaQclZ801YCY1ew.jpg -711158464,Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs4.ebaystatic.com/d/l225/m/mzvzEUIknaQclZ801YCY1ew.jpg -711158465,BRAND NEW Sony PlayStation 4 BUNDLE 500gb,,http://thumbs4.ebaystatic.com/d/l225/m/m9TQTiWcWig7SeQh9algLZg.jpg -711158466,"Sony PlayStation 4 500GB, Dualshock Wireless Control, HDMI Gaming Console Refurb","The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs4.ebaystatic.com/d/l225/m/mTZYG5N6xWfBi4Ok03HmpMw.jpg -711158467,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console w/ 2 Controllers,,http://thumbs2.ebaystatic.com/d/l225/m/mX5Qphrygqeoi7tAH5eku2A.jpg -711158468,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console *NEW*,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/mGjN4IrJ0O8kKD_TYMWgGgQ.jpg -711158469,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console..wth Mortal Kombat X,,http://thumbs2.ebaystatic.com/d/l225/m/mrpqSNXwlnUVKnEscE4348w.jpg -711158470,Genuine SONY PS4 Playstation 4 500GB Gaming Console - Black,,http://thumbs4.ebaystatic.com/d/l225/m/myrPBFCpb4H5rHI8NyiS2zA.jpg -711158471,[Sony] Playstation 4 PS4 Video Game Console Black - Latest Model,,http://thumbs4.ebaystatic.com/d/l225/m/mce0c7mCuv3xpjllJXx093w.jpg -711158472,Sony PlayStation 4 (Latest Model) 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/miVSA1xPO5fCNdYzEMc8rSQ.jpg -711158473,Sony PlayStation 4 - 500 GB Jet Black Console - WITH LAST OF US REMASTERED,,http://thumbs2.ebaystatic.com/d/l225/m/mLjnOxv2GWkrkCtgsDGhJ6A.jpg -711158474,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs3.ebaystatic.com/d/l225/m/mjMittBaXmm_n4AMpETBXhQ.jpg -711158475,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs2.ebaystatic.com/d/l225/m/m1n1qrJ7-VGbe7xQvGdeD6Q.jpg -711158476,"Sony PlayStation 4 - 500 GB Jet Black Console (3 controllers,3 games included)",,http://thumbs3.ebaystatic.com/d/l225/m/mIoGIj9FZG7HoEVkPlnyizA.jpg -711158477,Sony PlayStation 4 500GB Console with 2 Controllers,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/m4fuJ5Ibrj450-TZ83FAkIQ.jpg -711158478,Sony - PlayStation 4 500GB The Last of Us Remastered Bundle - Black,,http://thumbs3.ebaystatic.com/d/l225/m/mzXSIw8Hlnff8IjXJQrXJSw.jpg -711158479,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs2.ebaystatic.com/d/l225/m/m-9S63CgFoUijY3ZTyNs3KA.jpg -711158480,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs1.ebaystatic.com/d/l225/m/mdF9Bisg9wXjv_R9Y_13MWw.jpg -711158481,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console*,,http://thumbs1.ebaystatic.com/d/l225/m/m4_OQHMmIOCa8uEkBepRR5A.jpg -711158482,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs2.ebaystatic.com/d/l225/m/mZ0nR8iz-QAfLssJZMp3L5Q.jpg -711158483,[Sony] Playstation 4 PS4 1105A Video Game Console 500GB White - Latest Model,,http://thumbs4.ebaystatic.com/d/l225/m/m8iTz5cLQLNjD9D3O2jT3IQ.jpg -711158484,NEW! Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream 5ml,,http://thumbs2.ebaystatic.com/d/l225/m/mrraWCpvP5YKk5rYgotVDLg.jpg -711158485,Obagi Elastiderm Eye Treatment Cream 0.5 oz / 15g Authentic NiB Sealed [5],,http://thumbs1.ebaystatic.com/d/l225/m/mJ4ekz6_bDT5G7wYtjM-qRg.jpg -711158486,Lancome Renergie Eye Anti-Wrinkle & Firming Eye Cream 0.5oz New,,http://thumbs2.ebaystatic.com/d/l225/m/mxwwyDQraZ-TEtr_Y6qRi7Q.jpg -711158487,OZ Naturals - The BEST Eye Gel - Eye Cream For Dark Circles Puffiness and,,http://thumbs2.ebaystatic.com/d/l225/m/mk2Z-hX5sT4kUxfG6g_KFpg.jpg -711158488,Elastiderm Eye Cream (0.5oz/15g),,http://thumbs3.ebaystatic.com/d/l225/m/mHxb5WUc5MtGzCT2UXgY_hg.jpg -711158489,new CLINIQUE Repairwear Laser Focus Wrinkle Correcting Eye Cream 0.17 oz/ 5 ml,,http://thumbs1.ebaystatic.com/d/l225/m/mQSX2wfrSeGy3uA8Q4SbOKw.jpg -711158490,NIB Full Size Dermalogica Multivitamin Power Firm Eye Cream,,http://thumbs4.ebaystatic.com/d/l225/m/m2hxo12e5NjXgGiKIaCvTLA.jpg -711158491,24K Gold Collagen Anti-Dark Circles Anti-Aging Bio Essence Repairing Eye Cream,,http://thumbs4.ebaystatic.com/d/l225/m/mt96efUK5cPAe60B9aGmgMA.jpg -711158492,Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream Full Size .5oz 15mL,,http://thumbs3.ebaystatic.com/d/l225/m/mZyV3wKejCMx9RrnC8X-eMw.jpg -711158493,NEW! Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream 5ml,,http://thumbs4.ebaystatic.com/d/l225/m/m9hX_z_DFnbNCTh0VFv3KcQ.jpg -711158494,3 Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream .17 oz/5 ml Each,,http://thumbs1.ebaystatic.com/d/l225/m/mYiHsrGffCg_qgkTbUWZU1A.jpg -711158495,Lancome High Resolution Eye Cream .95 Oz Refill-3X .25 Oz Plus .20 Oz Lot,,http://thumbs1.ebaystatic.com/d/l225/m/mFuQxKoEKQ6wtk2bGxfKwow.jpg -711158496,NEW! Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream 5ml,,http://thumbs4.ebaystatic.com/d/l225/m/mLBRCDiELUnYos-vFmIcc7A.jpg -711158497,Neutrogena Rapid Wrinkle Repair Eye Cream -0.5 Oz. -New-,,http://thumbs4.ebaystatic.com/d/l225/m/mE1RWpCOxkCGuuiJBX6HiBQ.jpg -711158498,20g Snail Repair Eye Cream Natural Anti-Dark Circles Puffiness Aging Wrinkles,,http://thumbs4.ebaystatic.com/d/l225/m/mh4gBNzINDwds_r778sJRjg.jpg -711158499,Vichy-Neovadiol GF Eye & Lip Contour Cream 0.5 Fl. Oz,,http://thumbs4.ebaystatic.com/d/l225/m/m_6f0ofCm7PTzuithYuZx3w.jpg -711158500,Obagi Elastiderm Eye Cream 0.5 oz. New In Box. 100% Authentic! New Packaging!,,http://thumbs2.ebaystatic.com/d/l225/m/ma0PK-ASBXUiHERR19MyImA.jpg -711158501,NEW! 
Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream .17oz / 5ml,,http://thumbs3.ebaystatic.com/d/l225/m/m72NaXYlcXcEeqQFKWvsdZA.jpg -711158502,Kiehl's CREAMY EYE TREATMENT cream with AVOCADO 0.5 oz FULL SIZE,,http://thumbs3.ebaystatic.com/d/l225/m/mOI407HnILb_tf-RgdvfYyA.jpg -711158503,Clinique repairwear laser focus wrinkle correcting eye cream .5 oz 15ml,,http://thumbs4.ebaystatic.com/d/l225/m/mQwNVst3bYG6QXouubmLaJg.jpg -711158504,Caudalie Premier Cru The Eye Cream La Creme New Anti Aging Eye Treatment,,http://thumbs1.ebaystatic.com/d/l225/m/mM4hPTAWXeOjovNk9s_Cqag.jpg -711158505,Jeunesse Instantly Ageless -- New Box Of 50 Sachets -- Eye - Face Wrinkle Cream,,http://thumbs2.ebaystatic.com/d/l225/m/m5EfWbi6ZYs4JpYcsl0Ubaw.jpg -711158506,VELOUR SKIN EYE CREAM .5 FL OZ 15ML NEW NIP ANTI-AGING WRINKLE CREAM,,http://thumbs1.ebaystatic.com/d/l225/m/m2uEf6q1yASH8FkWqYdOv1w.jpg -711158507,Shiseido White Lucent Anti-Dark Circles/Puffiness Eye Cream 15ml/.53oz Full Size,,http://thumbs1.ebaystatic.com/d/l225/m/m_CtzoqU2Vgv4GKx8ONS6qw.jpg -711158508,Murad Resurgence Renewing Eye Cream Anti-Aging .25 oz NEW Dark Circles Wrinkle,,http://thumbs1.ebaystatic.com/d/l225/m/mhWJC10iowgUDGm4KMQKNMg.jpg -711158509,D-Link DIR-615 300Mbps Wireless-N Router 4-Port w/Firewall,,http://thumbs3.ebaystatic.com/d/l225/m/mdSBH9ROXRn3TBb8OFDT6jA.jpg -711158510,Triton MOF001 2 1/4hp dual mode precision Router. New!! *3 day auction*,,http://thumbs1.ebaystatic.com/d/l225/m/mozWd2SBskbDBlWAKsMlVew.jpg -711158511,Porter-Cable 3-1/4 HP Five-Speed Router 7518 - Power Tools Routers,,http://thumbs2.ebaystatic.com/d/l225/m/mpZDTXpiyesDrZh_FLMyqXQ.jpg -711158512,Linksys EA6900 AC1900 Wi-Fi Wireless Router Dual Band with Gigabit &USB 3.0 Port,,http://thumbs4.ebaystatic.com/d/l225/m/m3OfBSnHBDhhs_Ve-DSBKQw.jpg -711158513,Linksys EA6500 1300 Mbps 4-Port Gigabit Wireless AC Router,,http://thumbs1.ebaystatic.com/d/l225/m/m7cfymJPc7CLADoTiEYFzwA.jpg -711158514,Makita RT0700CX3 1-1/4 Horsepower Compact Router Kit / Trimmer NEW,,http://thumbs2.ebaystatic.com/d/l225/m/mr-F3rCxDYsLcj8hnmaRN4A.jpg -711158515,NETGEAR R6250 AC1600 Smart WiFi Dual Band Gigabit Router 802.11ac 300 1300 Mbps,,http://thumbs4.ebaystatic.com/d/l225/m/mc8Ic8Cq2lPqPnjNGAQBBCQ.jpg -711158516,NETGEAR Nighthawk AC1900 Dual Band Wi-Fi Gigabit Router (R7000) BRAND NEW SEALED,,http://thumbs3.ebaystatic.com/d/l225/m/mdL34EQi0l-Kg-DlvF6wpqA.jpg -711158517,Netgear WNDR3400 N600 Wireless Dual Band Router (WNDR3400-100),,http://thumbs4.ebaystatic.com/d/l225/m/mKr4cNk6utJXSdVYXzwrScQ.jpg -711158518,Netgear N600 300 Mbps 4-Port 10/100 Wireless N Router (WNDR3400),,http://thumbs2.ebaystatic.com/d/l225/m/mUPdyhbW9pzEm1VbqX0YudA.jpg -711158519,NETGEAR N600 WNDR3400 Wireless Dual Band Router F/S,,http://thumbs1.ebaystatic.com/d/l225/m/my55jF5kHnG9ipzFycnjooA.jpg -711158520,Netgear NIGHTHAWK AC1900 1300 Mbps 4-Port Gigabit Wireless AC Router (R7000),,http://thumbs3.ebaystatic.com/d/l225/m/mrPLRTnWx_JXLNIp5pCBnzQ.jpg -711158521,Netgear N900 450 Mbps 4-Port Gigabit Wireless N Router (WNDR4500),,http://thumbs2.ebaystatic.com/d/l225/m/mXBL01faHlHm7Ukh188t3yQ.jpg -711158522,Netgear R6300V2 AC1750 1300 Mbps 4-Port Gigabit Wireless AC Router,,http://thumbs1.ebaystatic.com/d/l225/m/mTdnFB9Z71efYJ9I5-k186w.jpg -711158523,Makita RT0701C 1-1/4 HP Compact Router With FACTORY WARRANTY!!!,,http://thumbs2.ebaystatic.com/d/l225/m/m7AA4k3MzYFJcTlBrT3DwhA.jpg -711158524,"CISCO LINKSYS EA4500 DUAL-BAND N9000 WIRELESS ROUTER, 802.11N, UP TO 450 
MBPs",,http://thumbs4.ebaystatic.com/d/l225/m/mwfVIXD3dZYt_qpHyprd7hg.jpg -711158525,Netgear N300 v.3 300 Mbps 5-Port 10/100 Wireless N Router (WNR2000),,http://thumbs4.ebaystatic.com/d/l225/m/mopRjvnZwbsVH9euqGov5kw.jpg -711158526,Netgear Nighthawk R7000 2330 Mbps 4-Port Gigabit Wireless N Router...,,http://thumbs4.ebaystatic.com/d/l225/m/mns82UY4FfqYXPgqrpJ9Bzw.jpg -711158527,Netgear N900 450 Mbps 4-Port Gigabit Wireless N Router R4500 ~ FreE ShiPPinG ~,,http://thumbs1.ebaystatic.com/d/l225/m/m_o0mSRmySgJUuqHYDIQiuA.jpg -711158528,D-Link Wireless Router Model DIR-625,,http://thumbs2.ebaystatic.com/d/l225/m/mYPXwZMlDUjOQ3Sm3EtU37Q.jpg -711158529,D-Link DIR-657 300 Mbps 4-Port Gigabit Wireless N Router Hd Media Router 1000,"Stream multiple media content - videos, music and more to multiple devices all at the same time without lag or skipping. The HD Fuel technology in the DIR-657 lets you watch Netflix and Vudu , play your Wii or Xbox 360 online or make Skype calls all without worrying about the skipping or latency you might experience with standard routers. It does so by automatically giving extra bandwidth for video, gaming and VoIP calls using HD Fuel QoS technology. The D-Link HD Media Router 1000(DIR-657) also comes equipped with 4 Gigabit ports to provide speeds up to 10x faster than standard 10/100 ports. What s more, it uses 802.11n technology with multiple intelligent antennas to maximize the speed and range of your wireless signal to significantly outperform 802.11g devices.",http://thumbs1.ebaystatic.com/d/l225/m/m0xyPdWrdVKe7By4QFouVeA.jpg -711158530,D-Link DIR-860L AC1200 4-Port Cloud Router Gigabit Wireless 802.11 AC,,http://thumbs3.ebaystatic.com/d/l225/m/mk4KNj6oLm7863qCS-TqmbQ.jpg -711158531,D-Link DIR-862L Wireless AC1600 Dual Band Gigabit Router,,http://thumbs2.ebaystatic.com/d/l225/m/m6Arw8kaZ4EUbyKjHtJZLkA.jpg -711158532,LINKSYS AC1600 DUAL BAND SMART WI-FI ROUTER EA6400 BRAND NEW,,http://thumbs3.ebaystatic.com/d/l225/m/mdK7igTS7_TDD7ajfVqj-_w.jpg -711158533,Netgear AC1900 1300 Mbps 4-Port Gigabit Wireless AC Router (R7000),,http://thumbs4.ebaystatic.com/d/l225/m/mdL34EQi0l-Kg-DlvF6wpqA.jpg -711158534,Panasonic ES-LA63 Cordless Rechargeable Men's Electric Shaver,,http://thumbs3.ebaystatic.com/d/l225/m/mzKKlCxbADObevcgoNjbXRg.jpg -711158535,Panasonic ARC 5 Best Mens Shaver,,http://thumbs4.ebaystatic.com/d/l225/m/mt34Y-u0okj-SqQm8Ng_rbQ.jpg -711158536,Panasonic Es8092 Wet Dry Electric Razor Shaver Cordless,,http://thumbs3.ebaystatic.com/d/l225/m/mlIxTz1LsVjXiZz2CzDquJw.jpg -711158537,Panasonic ARC4 ES-RF31-s Rechargeable Electric Shaver Wet/dry 4 Nanotech Blade,"Made for folks who need a great shave, the Panasonic electric shaver is convenient and consistent. Featuring an ergonomic design, this Panasonic ES-RF31-S is ideal for keeping a stubble-free face, so you can retain wonderfully smooth skin. With the precision blades included on the Panasonic electric shaver, you can get smooth shaves with every use. As this men's electric shaver features a gentle shaving mechanism, you can help avoid burning sensations on tender skin. Make sure you consistently get multiple perfect shaves without depleting the power with the exceptional shave time typical of this Panasonic ES-RF31-S.",http://thumbs1.ebaystatic.com/d/l225/m/mi4QM99Jq4oma5WLAL0K7Wg.jpg -711158538,"Panasonic ES3831K Single Blade Travel Shaver, Black New","Strong and trustworthy, the Panasonic electric shaver is built for folks who are worried about a wonderful shave every day. 
This Panasonic ES3833S is just right for taming your beard, with an easy-to-maneuver design, so you can retain wonderfully soft skin. Spend as much time as you need getting a complete shave by making use of the outstanding shave time typical of the Panasonic electric shaver. Moreover, this men's electric shaver includes precision foil blades, so you can get wonderful shaves over a prolonged period. With the gentle shaving mechanism on this Panasonic ES3833S, you can help avoid burning sensations on tender skin.",http://thumbs3.ebaystatic.com/d/l225/m/mfqMoj4xDlBFXp1ZznxCGbQ.jpg -711158539,Panasonic ES8103S Arc3 Electric Shaver Wet/Dry with Nanotech Blades for Men,,http://thumbs1.ebaystatic.com/d/l225/m/myaZLqzt3I7O-3xXxsJ_4fQ.jpg -711158540,Panasonic ES8103S Arc3 Electric Shaver Wet/Dry with Nanotech Blades,,http://thumbs1.ebaystatic.com/d/l225/m/mcrO4BkjBkM78XHm-aClRGg.jpg -711158543,Panasonic ES3831K Single Blade Wet & Dry Travel Shaver - New & Sealed,,http://thumbs4.ebaystatic.com/d/l225/m/mqWDU2mHsFWAuGosMIGcIMg.jpg -711158544,Panasonic ES8103S Arc 3 E W/O POUCH & MANUAL Men's Wet/Dry Rechargeable Shaver,,http://thumbs2.ebaystatic.com/d/l225/m/mZXgTj-fQfcgAlzOGQYkqFw.jpg -711158545,PANASONIC ES3831K Pro-Curve Battery Operated Travel Wet/Dry Shaver,,http://thumbs1.ebaystatic.com/d/l225/m/m8McQMCfgdp50trM_YJ88cw.jpg -711158546,PANASONIC ARC3 ES-LT33-S WET DRY WASHABLE RECHARGEABLE MEN'S ELECTRIC SHAVER NIB,,http://thumbs1.ebaystatic.com/d/l225/m/m9yUif5xyhGfh7Ag-_fcLdA.jpg -711158547,Panasonic ES-LV81-k Arc 5 Wet & Dry Rechargeable Men's Foil Shaver New,,http://thumbs1.ebaystatic.com/d/l225/m/mEfZHzDoKrH4DBfU8e_K93A.jpg -711158548,"NEW Panasonic ES-RF31-S 4 Blade Men's Electric Razor Wet/Dry, Factory Sealed",,http://thumbs2.ebaystatic.com/d/l225/m/mfhMhMoDkrGtqWW_IyqVGuQ.jpg -711158549,Panasonic ES8243A E Arc4 Men's Electric Shaver Wet/Dry,"eBay item number:181670746515 - - - Seller assumes all responsibility for this listing. - - Last updated on -  Mar 23, 2015 08:55:50 PDT  - View all revisions - - - - - - Item specifics - - - -
Condition:Used - : - - -
",http://thumbs4.ebaystatic.com/d/l225/m/mcxFUwt3FrGEEPzT7cfQn7w.jpg -711158550,Panasonic ES-3833 Wet/Dry Men Shaver Razor Battery Operate Compact Travel ES3833,,http://thumbs2.ebaystatic.com/d/l225/m/mAqa9pHisKsLSk5nqMg4JJQ.jpg -711158551,Panasonic Pro-Curve ES3831K Shaver - Dry/Wet Technology - Stainless Steel Foil,,http://thumbs3.ebaystatic.com/d/l225/m/mGqD8eGIwseT5nsM53W3uRQ.jpg -711158552,Panasonic Wet and Dry Shaver - ES-RW30s ES-RW30-S,"The Panasonic electric shaver is well-suited to shielding particularly sensitive skin and providing a smooth shave. It's both trustworthy and transportable. Because this Panasonic ES-RW30-S has a gentle shaving mechanism, you can avoid irritation and raw feeling skin in particularly tender areas. The Panasonic electric shaver is ideal for ridding yourself of stubble, with its special design, so you can sustain wonderfully supple skin. The exceptional shave time featured on this men's electric shaver helps you to make sure you consistently receive many complete shaves without depleting the power. Plus, this Panasonic ES-RW30-S features precision blades, so you can enjoy smooth shaves for months on end.",http://thumbs1.ebaystatic.com/d/l225/m/mvPElpjXmgo0NhP-P5F8LlQ.jpg -711158553,Panasonic ES-LF51-A Arc4 Electric Shaver Wet/Dry with Flexible Pivoting Head,,http://thumbs3.ebaystatic.com/d/l225/m/mC_zAQrMQKPLHdENU7N3UjQ.jpg -711158554,Panasonic ES8103S Arc3 Men's Electric Shaver Wet/Dry with Nanotech Blades,,http://thumbs3.ebaystatic.com/d/l225/m/moBByNwPn93-g-oBBceS2kw.jpg -711158555,panasonic ARC3 shaver es8103s,,http://thumbs1.ebaystatic.com/d/l225/m/mJlAp6t6OMIOaYgKnyelIMg.jpg -711158556,Panasonic ES-534 Men's Electric Shaver New ES534 Battery Operated Compact Travel,,http://thumbs3.ebaystatic.com/d/l225/m/mDr2kpZLVSdy1KTPVYK2YUg.jpg -711158557,Panasonic Portable Shaving Machine Cclippers Washable Single Blade Shaver+Brush,,http://thumbs3.ebaystatic.com/d/l225/m/mJdzJPoOALps0Lv4WtW2b0A.jpg -711158559,Baratza Solis Maestro Conical Burr Coffee Bean Grinder Works Great Nice Cond,,http://thumbs4.ebaystatic.com/d/l225/m/mdjbD7YFR6JRq-pkeajhK7w.jpg -711158560,Proctor Silex Fresh Grind Electric Coffee Bean Grinder White,,http://thumbs4.ebaystatic.com/d/l225/m/mtXoRn5Ytmqz0GLHYmBUxpA.jpg -711158561,Cuisinart 8-oz. Supreme Grind Automatic Burr Coffee Grinder,,http://thumbs4.ebaystatic.com/d/l225/m/my_9cXPvwwRVFqo6MXWfpag.jpg diff --git a/examples/kaggle/schema-kaggle.toml b/examples/kaggle/schema-kaggle.toml new file mode 100644 index 000000000..fcf2685e9 --- /dev/null +++ b/examples/kaggle/schema-kaggle.toml @@ -0,0 +1,19 @@ +# This schema has been generated ... +# The order in which the attributes are declared is important, +# it specify the attribute xxx... + +identifier = "id" + +[attributes.id] +stored = true + +[attributes.title] +stored = true +indexed = true + +[attributes.description] +stored = true +indexed = true + +[attributes.image] +stored = true From b701eb85b8f33fde36a82bf0837dc5c547779c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 May 2019 15:52:03 +0200 Subject: [PATCH 24/32] doc: Update the README features links --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5cc199284..f64015bbd 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,19 @@ A _full-text search database_ using a key-value store internally. 
## Features -- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/criterion/mod.rs#L94-L105) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents -- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/criterion/mod.rs#L24-L31) and can apply them in any custom order -- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/query_builder.rs#L165), useful for paginating results -- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/query_builder.rs#L96) and [filter](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/query_builder.rs#L85) returned documents based on context defined rules -- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/20b5a6a06e4b897313e83e24fe1e1e47c660bfe8/examples/schema-example.toml) -- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/a960c325f30f38be6a63634b3bd621daf82912a8/src/tokenizer/mod.rs) can index latin and kanji based languages -- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/mod.rs#L15-L18), useful to highlight matched words in results -- Accepts query time search config like the [searchable fields](https://github.com/meilisearch/MeiliDB/blob/e0b759839d552f02e3dd0064948f4d8022415ed7/src/rank/query_builder.rs#L107) +- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L95-L101) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents +- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L22-L29) and can apply them in any custom order +- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L146), useful for paginating results +- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L68) and [filter](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L57) returned documents based on context defined rules +- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/examples/movies/schema-movies.toml) +- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-tokenizer/src/lib.rs#L99) can index latin and kanji based languages +- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/lib.rs#L117-L120), useful to highlight matched words in results +- Accepts query time search config like the [searchable fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L79) - Supports run time indexing (incremental indexing) -It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. 
The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliDB/issues/82) and provides great performances. +It uses [sled](https://github.com/spacejam/sled) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliDB/issues/82) and provides great performances. You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries or you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents. @@ -70,4 +70,4 @@ Once the command is executed, the index should be in the `test.mdb` folder. You ```bash cargo run --release --example query-database -- test.mdb -n 10 id title overview release_date -``` \ No newline at end of file +``` From 021f0545eb6cab5420f5dfe9620d4829d5a98a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 May 2019 16:34:06 +0200 Subject: [PATCH 25/32] doc: Update the deep-dive explanation text --- deep-dive.md | 91 +++++++++++++--------------------------------------- 1 file changed, 23 insertions(+), 68 deletions(-) diff --git a/deep-dive.md b/deep-dive.md index 1a3c19134..05298c932 100644 --- a/deep-dive.md +++ b/deep-dive.md @@ -1,28 +1,22 @@ # A deep dive in MeiliDB -On the 9 of december 2018. - -MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [RocksDB](https://github.com/facebook/rocksdb). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the data as an [inverted index](https://en.wikipedia.org/wiki/Inverted_index). - +On the 15 of May 2019. +MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [sled](https://github.com/spacejam/sled). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the matching words in an [inverted index](https://en.wikipedia.org/wiki/Inverted_index). - [Where is the data stored?](#where-is-the-data-stored) - [What does the key-value store contains?](#what-does-the-key-value-store-contains) - - [The blob type](#the-blob-type) + - [The inverted word index](#the-inverted-word-index) - [A final state transducer](#a-final-state-transducer) - [Document indexes](#document-indexes) - - [Document ids](#document-ids) - [The schema](#the-schema) - [Document attributes](#document-attributes) -- [How is an update handled?](#how-is-an-update-handled) - - [The merge operation is CPU consuming](#the-merge-operation-is-cpu-consuming) - [How is a request processed?](#how-is-a-request-processed) - [Query lexemes](#query-lexemes) - [Automatons and query index](#automatons-and-query-index) - [Sort by criteria](#sort-by-criteria) - - [Retrieve original documents](#retrieve-original-documents) @@ -30,21 +24,17 @@ MeiliDB is a full text search engine based on a final state transducer named [fs MeiliDB is entirely backed by a key-value store like any good database (i.e. Postgres, MySQL). 
This brings great flexibility in the way documents can be stored and updates are handled over time.
 
-[RocksDB brings some](https://rocksdb.org/blog/2015/02/27/write-batch-with-index.html) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent, for example we use SST files and the key-value store ability to load them in one time to manage updates.
-
-Note that the SST file have the same restriction as the fst, it needs its keys to be added in order at creation.
+[sled will bring some](https://github.com/spacejam/sled/tree/434533332a3f485e6d2e467023be0a0b55d3a1af#plans) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent.

## What does the key-value store contains?

-It contain the blob, the schema and the documents stored attributes.
+It contains the inverted word index, the schema and the document fields.

-### The blob type
+### The inverted word index

-[The Blob type](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/mod.rs#L16-L19) is a data structure that indicate if an update is a positive or a negative one. In the case where the update is considered positive, the blob will contain [an fst map and the document indexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15-L18) associated. In the other case it will only contain [all the document ids](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/negative/blob.rs#L12-L14) that must be considered removed.
-
-The Blob type [is stored under the "*data-index*" entry](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/update/positive/update.rs#L497-L499) and marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation) in the key-value store.
+[The inverted word index](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs) is a sled Tree dedicated to storing and giving access to all the documents that contain a specific word. The information stored under a word is simply a big ordered array describing where in the documents that word has been found. In other words, a big list of [`DocIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L35-L51) values.

#### A final state transducer

_...also abbreviated fst_

This is the first entry point of the engine, you can read more about how it works in the beautiful blog post of @BurntSushi, [Index 1,600,000,000 Keys with Automata and Rust](https://blog.burntsushi.net/transducers/).

-To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index associated with a value that, for the moment, can only be an `u64`. When you want to search in it you can provide any automaton you want, in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
-
-Note that the number under each word is auto-incremental, each new word have a new number that is greater than the previous one.
-
-Another powerful feature of `fst` is that it can nearly avoid using RAM and be streamed to disk for example, the problem is that the keys must be always added in lexicographic order, so you must sort them before, for the moment MeiliDB uses a [BTreeMap](https://github.com/Kerollmops/raptor-rs/blob/8abdb0a228e2808fe1814a6a0641a4b72d158579/src/metadata/doc_indexes.rs#L107-L112).
+To make it short, it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index. When you want to search in it you can provide any automaton you want; in MeiliDB [a custom Levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.

#### Document indexes

-As it has been specified, the `fst` can only store a number corresponding to a word, an `u64`, but the goal of the search engine is to retrieve a match in a document when a query is made. You want it to return some sort of position in an attribute in a document, an information about where the given word match.
+The `fst` will only return the words that match the search automaton, but the goal of the search engine is to retrieve all the matches in all the documents when a query is made. You want it to return some sort of position in an attribute in a document, information about where the given word matched.

-To make it possible, a custom data structure has been developed, the document indexes is composed of two arrays, the ranges array and all the docindexes corresponding to a given range, each range identify the word number. The [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/data/doc_indexes.rs#L23) type is designed to be streamed when constructed, consumming a minimum amount of ram like the fst. Another advantage is that the slices are accessible in `O(1)` when you know the word associated number.
-
-#### Document ids
-
-This is a simple ordered list of all documents ids which must be considered deleted. It is used with [the sdset library](https://docs.rs/sdset/0.3.0/sdset/duo/struct.DifferenceByKey.html), the docindexes and the `DifferenceByKey` operation builder when merging blobs.
-
-When a blob represent a negative update it only contains this simple slice of deleted documents ids.
+To make this possible we retrieve all of the `DocIndex` entries corresponding to the matching words in the fst: we use the [`WordsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs#L11-L21) Tree to get the `DocIndexes` corresponding to the words.

### The schema

-The schema is a data structure that represents which documents attributes should be stored and which should be indexed. It is stored under the "_data-schema_" entry and given to MeiliDB only at the creation.
+The schema is a data structure that represents which document attributes should be stored and which should be indexed. It is stored under the [`MainIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/main_index.rs#L12) Tree and given to MeiliDB only at the creation of an index.

-Each document attribute is associated to a unique 32 bit number named `SchemaAttr`.
+Each document attribute is associated with a unique 16-bit number named [`SchemaAttr`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/schema.rs#L186).
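To make the shape of a schema concrete, here is a minimal sketch using the `SchemaBuilder` API that the tests later in this series exercise (the attribute names are illustrative):

```rust
use meilidb_data::Schema;
use meilidb_data::schema::{SchemaBuilder, STORED, INDEXED};

fn movies_schema() -> Schema {
    // The identifier attribute uniquely identifies each document.
    let mut builder = SchemaBuilder::with_identifier("id");

    // Attributes are registered in order; each one is mapped to
    // a unique `SchemaAttr` number.
    builder.new_attribute("id", STORED);
    builder.new_attribute("title", STORED | INDEXED);
    builder.new_attribute("overview", STORED | INDEXED);

    builder.build()
}
```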
-In the future this schema type could be given along with updates and probably be different from the original, the database could be able to handled this document structure and reindex it.
+In the future, this schema type could be given along with updates; the database would then be able to handle the new schema and reindex the data according to it.

### Document attributes

-When the engine handle a query the result that the requester want is a document, not only the [match](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/lib.rs#L51-L79) associated to it, fields of the original document must be returned too.
+When the engine handles a query the result that the requester wants is a document; not only the [`Matches`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L62-L88) associated with it, the fields of the original document must be returned too.

-So MeiliDB again uses the power of the underlying key-value store and save the documents attributes marked as _STORE_. The key is prefixed by "_doc_" followed by the 64 bit document id in bytes and the schema attribute number in bytes corresponding to the document attribute stored.
+So MeiliDB again uses the power of the underlying key-value store and saves the document attributes marked as _STORE_ in the schema. The dedicated Tree for this information is the [`DocumentsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/documents_index.rs#L11).

-When a document field is saved in the key-value store its value is binary encoded using the [bincode](https://docs.rs/bincode/) library, so a document must be serializable using serde.
-
-
-
-## How is an update handled?
-
-First of all an update in MeiliDB is nothing more than [a RocksDB SST file](https://github.com/facebook/rocksdb/wiki/Creating-and-Ingesting-SST-files). It contains the blob and all the documents attributes binary encoded like described above. Note that the blob is stored under the "_data-index_" key marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation).
-
-### The merge operation is CPU consuming
-
-When [the database ingest an update](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L108-L145) it gives the SST file to the underlying RocksDB, once it has ingested it there is a "_data-index_" entry available, we can request it but the key-value store will call a function before, a merge operation is performed.
-
-This merge operation is done on multiple blobs as you have understood and will compute a [PositiveBlob](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15), this type contains the fst and document indexes structures allowing us to search for documents. This two data structures can be considered as the inverted index.
-
-The computation time of this merge is important, RocksDB doesn't keep the previous merged result, it will call our merge operation each time until it decided to do a compaction. So [we must force this compaction earlier](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L129-L131) when we receive an update to reduce this cost.
-
-This way when we request the "_data-index_" value it will gives us the previously merged positive blob without any other merge overhead.
+When a document field is saved in the key-value store its value is binary encoded using [MessagePack](https://github.com/3Hren/msgpack-rust), so a document must be serializable using serde.

## How is a request processed?

-Now that we have our "_data-index_" we are able to return results based on a query. In the MeiliDB universe a query is a string.
+Now that we have our inverted index we are able to return results based on a query. In the MeiliDB universe a query is a simple string containing words.

### Query lexemes

-The first step to be able to call the underlying structures is to split the query in words, for that we use a [custom tokenizer](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/tokenizer/mod.rs) that is not finished for the moment, [there is an open issue](https://github.com/Kerollmops/MeiliDB/issues/3). Note that a tokenizer is specialized for a human language, this is the hard part.
+The first step to be able to call the underlying structures is to split the query into words; for that we use a [custom tokenizer](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-tokenizer/src/lib.rs#L82-L84). Note that a tokenizer is specialized for a human language; this is the hard part.

### Automatons and query index

-So to query the fst we need an automaton, in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), this automaton is constructed using a string and a maximum distance. According to the [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/automaton.rs#L62-L75) with different settings.
+So to query the fst we need an automaton; in MeiliDB we use a [Levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton) constructed from a string and a maximum distance. According to [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/automaton.rs#L59-L78) with different settings.

-Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst map, it will allow us to know which [automaton returns a word according to its index](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/metadata/ops.rs#L111). The `Stream` is able to return all the numbers associated to the words. We use these numbers to find the whole list of `DocIndexes` associated and do the union set operation.
+Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst set. The `Stream` is able to return all the matching words. We use these words to find the whole list of associated `DocIndexes`.

-With all these informations it is possible [to reconstruct a list of all the DocIndexes associated](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L62-L99) with the words queried.
+With all this information it is possible [to reconstruct a list of all the `DocIndexes` associated](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L103-L130) with the words queried.

### Sort by criteria

Now that we are able to get a big list of [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L21-L36), sorting them by criteria requires more information, like the Levenshtein distance or the fact that a query word matches exactly the word stored in the fst. So [we stuff it a little bit](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L86-L93), and aggregate all these [Matches](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L47-L74) for each document. This way it will be easy to sort a simple vector of documents using a bunch of functions.

-With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L108-L119) using bucket sorting. [Each criterion](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/criterion/mod.rs#L75-L87) is evaluated on each subslice without copy, thanks to [GroupByMut](https://github.com/Kerollmops/group-by/blob/cab857bae01463dbd0edb99b0e0d7f3624e6c6f5/src/lib.rs#L180-L185) which, I hope [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
+With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L160-L188) using bucket sorting. [Each criterion](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/criterion/mod.rs#L95-L101) is evaluated on each subslice without copying, thanks to [GroupByMut](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) which, I hope, [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
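To illustrate the bucket sort, here is a simplified sketch of the loop found in the query builder; the `Doc` type and the closure-based criteria are stand-ins for the real `RawDocument` and `Criterion` trait objects, and it assumes the `GroupByMut` extension trait from the [slice-group-by](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) crate:

```rust
use std::cmp::Ordering;
use std::mem;

use slice_group_by::GroupByMut;

struct Doc { id: u64, typos: u16 }

fn bucket_sort(documents: &mut [Doc]) {
    // Stand-ins for the real criteria: compare by the number of
    // typos first, then by document id to make the order total.
    let criteria: Vec<fn(&Doc, &Doc) -> Ordering> = vec![
        |a, b| a.typos.cmp(&b.typos),
        |a, b| a.id.cmp(&b.id),
    ];

    let mut groups = vec![documents];

    for criterion in criteria {
        let tmp_groups = mem::replace(&mut groups, Vec::new());

        for group in tmp_groups {
            // Sort each group that the previous criteria left equal...
            group.sort_unstable_by(criterion);

            // ...then split it, without copying, into subslices of
            // documents that this criterion considers equal.
            for group in group.binary_group_by_mut(|a, b| criterion(a, b) == Ordering::Equal) {
                groups.push(group);
            }
        }
    }
}
```

The real loop additionally tracks how many documents have been seen so it can stop sorting and splitting as soon as the requested range is covered, which is what makes the bucket sort pay off.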
+Note that it is possible to customize the criteria by using the `QueryBuilder::with_criteria` constructor; this way you can implement custom ranking based on the document attributes, using the appropriate structure and the [`document` method](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/index.rs#L86).

At this point, MeiliDB's work is over 🎉
-

From 3de633c869dd7f65779eaa48b5995082cad29d5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 May 2019 11:57:51 +0200
Subject: [PATCH 26/32] feat: Reexport sled to reduce user level library incompatibilities

---
 meilidb-data/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs
index 22d58fe08..79cc3a3e9 100644
--- a/meilidb-data/src/lib.rs
+++ b/meilidb-data/src/lib.rs
@@ -6,6 +6,7 @@ mod ranked_map;
 mod serde;
 pub mod schema;

+pub use sled;
 pub use self::database::{Database, Index, CustomSettings};
 pub use self::number::Number;
 pub use self::ranked_map::RankedMap;

From 921b063a7112e4e29f13edf8add43006e4c15f01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 May 2019 11:59:31 +0200
Subject: [PATCH 27/32] feat: Make the `DocumentsDeletion` public interface to take serde types

---
 .../src/database/documents_deletion.rs | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/meilidb-data/src/database/documents_deletion.rs b/meilidb-data/src/database/documents_deletion.rs
index 775b8d7f6..5f1424f3b 100644
--- a/meilidb-data/src/database/documents_deletion.rs
+++ b/meilidb-data/src/database/documents_deletion.rs
@@ -4,6 +4,7 @@ use std::sync::Arc;
 use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
 use fst::{SetBuilder, Streamer};
 use meilidb_core::DocumentId;
+use crate::serde::extract_document_id;

 use super::{Index, Error, InnerIndex};

@@ -17,10 +18,26 @@ impl<'a> DocumentsDeletion<'a> {
         DocumentsDeletion { inner, documents: Vec::new() }
     }

-    pub fn delete_document(&mut self, id: DocumentId) {
+    fn delete_document_by_id(&mut self, id: DocumentId) {
         self.documents.push(id);
     }

+    pub fn delete_document<D>(&mut self, document: D) -> Result<(), Error>
+    where D: serde::Serialize,
+    {
+        let schema = &self.inner.lease_inner().schema;
+        let identifier = schema.identifier_name();
+
+        let document_id = match extract_document_id(identifier, &document)? {
+            Some(id) => id,
+            None => return Err(Error::MissingDocumentId),
+        };
+
+        self.delete_document_by_id(document_id);
+
+        Ok(())
+    }
+
     pub fn finalize(mut self) -> Result<(), Error> {
         let lease_inner = self.inner.lease_inner();
         let main = &lease_inner.raw.main;

From 4b36fa073990a90b6599405a8917cfa0f60cfd00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 May 2019 12:00:20 +0200
Subject: [PATCH 28/32] test: Add tests about additions and deletions of documents

---
 meilidb-data/Cargo.toml | 3 ++
 meilidb-data/tests/updates.rs | 67 +++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 meilidb-data/tests/updates.rs

diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml
index eb0d92f55..adf618657 100644
--- a/meilidb-data/Cargo.toml
+++ b/meilidb-data/Cargo.toml
@@ -27,3 +27,6 @@ rev = "40b3d48"
 [dependencies.fst]
 git = "https://github.com/Kerollmops/fst.git"
 branch = "arc-byte-slice"
+
+[dev-dependencies]
+tempfile = "3.0.7"

diff --git a/meilidb-data/tests/updates.rs b/meilidb-data/tests/updates.rs
new file mode 100644
index 000000000..393b2ecba
--- /dev/null
+++ b/meilidb-data/tests/updates.rs
@@ -0,0 +1,67 @@
+use serde_json::json;
+use meilidb_data::{Database, Schema};
+use meilidb_data::schema::{SchemaBuilder, STORED, INDEXED};
+
+fn simple_schema() -> Schema {
+    let mut builder = SchemaBuilder::with_identifier("objectId");
+    builder.new_attribute("objectId", STORED | INDEXED);
+    builder.new_attribute("title", STORED | INDEXED);
+    builder.build()
+}
+
+#[test]
+fn insert_delete_document() {
+    let tmp_dir = tempfile::tempdir().unwrap();
+    let database = Database::start_default(&tmp_dir).unwrap();
+
+    let schema = simple_schema();
+    let index = database.create_index("hello", schema).unwrap();
+
+    let doc1 = json!({ "objectId": 123, "title": "hello" });
+
+    let mut addition = index.documents_addition();
+    addition.update_document(&doc1).unwrap();
+    addition.finalize().unwrap();
+
+    let docs = index.query_builder().query("hello", 0..10).unwrap();
+    assert_eq!(docs.len(), 1);
+    assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
+
+    let mut deletion = index.documents_deletion();
+    deletion.delete_document(&doc1).unwrap();
+    deletion.finalize().unwrap();
+
+    let docs = index.query_builder().query("hello", 0..10).unwrap();
+    assert_eq!(docs.len(), 0);
+}
+
+#[test]
+fn replace_document() {
+    let tmp_dir = tempfile::tempdir().unwrap();
+    let database = Database::start_default(&tmp_dir).unwrap();
+
+    let schema = simple_schema();
+    let index = database.create_index("hello", schema).unwrap();
+
+    let doc1 = json!({ "objectId": 123, "title": "hello" });
+    let doc2 = json!({ "objectId": 123, "title": "coucou" });
+
+    let mut addition = index.documents_addition();
+    addition.update_document(&doc1).unwrap();
+    addition.finalize().unwrap();
+
+    let docs = index.query_builder().query("hello", 0..10).unwrap();
+    assert_eq!(docs.len(), 1);
+    assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
+
+    let mut replacement = index.documents_addition();
+    replacement.update_document(&doc2).unwrap();
+    replacement.finalize().unwrap();
+
+    let docs = index.query_builder().query("hello", 0..10).unwrap();
+    assert_eq!(docs.len(), 0);
+
+    let docs = index.query_builder().query("coucou", 0..10).unwrap();
+    assert_eq!(docs.len(), 1);
+    assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2));
+}

From 08d89053da4c8da9ef06b4f280bcd70b4b6f2edd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 May 2019 15:51:08 +0200
Subject: [PATCH 29/32] feat: Introduce a little simple http server for demo

---
 README.md | 29 +++++++--
 meilidb-data/Cargo.toml | 2 +-
 meilidb/Cargo.toml | 14 ++---
 meilidb/src/common_words.rs | 26 --------
 meilidb/src/lib.rs | 7 ---
 meilidb/src/main.rs | 74 ++++++++++++++++++++++
 meilidb/src/sort_by_attr.rs | 121 ------------------------------------
 7 files changed, 104 insertions(+), 169 deletions(-)
 delete mode 100644 meilidb/src/common_words.rs
 delete mode 100644 meilidb/src/lib.rs
 create mode 100644 meilidb/src/main.rs
 delete mode 100644 meilidb/src/sort_by_attr.rs

diff --git a/README.md b/README.md
index f64015bbd..cb98d2dea 100644
--- a/README.md
+++ b/README.md
@@ -59,15 +59,34 @@ We have seen much better performances when [using jemalloc as the global allocat

 ## Usage and examples

-MeiliDB runs with an index like most search engines.
-So to test the library you can create one by indexing a simple csv file.
+You can test a little part of MeiliDB by using the following commands; they create an index named _movies_ and initialize it with two great Tarantino movies.

 ```bash
-cargo run --release --example create-database -- test.mdb examples/movies/movies.csv --schema examples/movies/schema-movies.toml
+cargo run --release
+
+curl -XPOST 'http://127.0.0.1:8000/movies' \
+  -d '
+identifier = "id"
+
+[attributes.id]
+stored = true
+
+[attributes.title]
+stored = true
+indexed = true
+'
+
+curl -H 'Content-Type: application/json' \
+     -XPUT 'http://127.0.0.1:8000/movies' \
+     -d '{ "id": 123, "title": "Inglorious Bastards" }'
+
+curl -H 'Content-Type: application/json' \
+     -XPUT 'http://127.0.0.1:8000/movies' \
+     -d '{ "id": 456, "title": "Django Unchained" }'
 ```

-Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
+Once the database is initialized you can query it by using the following command:

 ```bash
-cargo run --release --example query-database -- test.mdb -n 10 id title overview release_date
+curl -XGET 'http://127.0.0.1:8000/movies/search?q=inglo'
 ```

diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml
index adf618657..0c5ff9f5e 100644
--- a/meilidb-data/Cargo.toml
+++ b/meilidb-data/Cargo.toml
@@ -14,7 +14,7 @@ meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
 meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 ordered-float = { version = "1.0.2", features = ["serde"] }
 sdset = "0.3.1"
-serde = { version = "1.0.90", features = ["derive"] }
+serde = { version = "1.0.91", features = ["derive"] }
 serde_json = { version = "1.0.39", features = ["preserve_order"] }
 sled = "0.23.0"
 toml = { version = "0.5.0", features = ["preserve_order"] }

diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml
index c2f4ad0fc..8f9517419 100644
--- a/meilidb/Cargo.toml
+++ b/meilidb/Cargo.toml
@@ -5,23 +5,19 @@ version = "0.3.1"
 authors = ["Kerollmops "]

 [dependencies]
-meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
 meilidb-data = { path = "../meilidb-data", version = "0.1.0" }
-meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
-
-[features]
-default = []
-i128 = ["meilidb-core/i128"]
-nightly = ["meilidb-core/nightly"]
+serde = { version = "1.0.91" , features = ["derive"] }
+serde_json = "1.0.39"
+tempfile = "3.0.7"
+tide = "0.2.0"

 [dev-dependencies]
+meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
 csv = "1.0.7"
 env_logger = "0.6.1"
 jemallocator = "0.1.9"
 quickcheck = "0.8.2"
 rand = "0.6.5"
 rand_xorshift = "0.1.1"
-serde = { version = "1.0.90", features = ["derive"] }
 structopt = "0.2.15"
-tempfile = "3.0.7"
 termcolor = "1.0.4"

diff --git a/meilidb/src/common_words.rs b/meilidb/src/common_words.rs
deleted file mode 100644
index 32cd79336..000000000
--- a/meilidb/src/common_words.rs
+++ /dev/null
@@ -1,26 +0,0 @@
-use std::io::{self, BufReader, BufRead};
-use std::collections::HashSet;
-use std::path::Path;
-use std::fs::File;
-
-#[derive(Debug)]
-pub struct CommonWords(HashSet<String>);
-
-impl CommonWords {
-    pub fn from_file<P>(path: P) -> io::Result<CommonWords>
-    where P: AsRef<Path>
-    {
-        let file = File::open(path)?;
-        let file = BufReader::new(file);
-        let mut set = HashSet::new();
-        for line in file.lines().filter_map(|l| l.ok()) {
-            let word = line.trim().to_owned();
-            set.insert(word);
-        }
-        Ok(CommonWords(set))
-    }
-
-    pub fn contains(&self, word: &str) -> bool {
-        self.0.contains(word)
-    }
-}

diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs
deleted file mode 100644
index aba7ab6a7..000000000
--- a/meilidb/src/lib.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-#![cfg_attr(feature = "nightly", feature(test))]
-
-mod common_words;
-mod sort_by_attr;
-
-pub use self::sort_by_attr::SortByAttr;
-pub use self::common_words::CommonWords;

diff --git a/meilidb/src/main.rs b/meilidb/src/main.rs
new file mode 100644
index 000000000..822d941b5
--- /dev/null
+++ b/meilidb/src/main.rs
@@ -0,0 +1,74 @@
+#![feature(async_await)]
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use tide::querystring::ExtractQuery;
+use tide::http::status::StatusCode;
+use tide::{error::ResultExt, response, App, Context, EndpointResult};
+use serde_json::Value;
+use meilidb_data::{Database, Schema};
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+struct SearchQuery {
+    q: String,
+}
+
+async fn create_index(mut cx: Context<Database>) -> EndpointResult<()> {
+    let index: String = cx.param("index").client_err()?;
+    let schema = cx.body_bytes().await.client_err()?;
+    let schema = Schema::from_toml(schema.as_slice()).unwrap();
+
+    let database = cx.app_data();
+    database.create_index(&index, schema).unwrap();
+
+    Ok(())
+}
+
+async fn update_documents(mut cx: Context<Database>) -> EndpointResult<()> {
+    let index: String = cx.param("index").client_err()?;
+    let document: HashMap<String, Value> = cx.body_json().await.client_err()?;
+
+    let database = cx.app_data();
+    let index = match database.open_index(&index).unwrap() {
+        Some(index) => index,
+        None => Err(StatusCode::NOT_FOUND)?,
+    };
+
+    let mut addition = index.documents_addition();
+    addition.update_document(document).unwrap();
+    addition.finalize().unwrap();
+
+    Ok(())
+}
+
+async fn search_index(cx: Context<Database>) -> EndpointResult {
+    let index: String = cx.param("index").client_err()?;
+    let query: SearchQuery = cx.url_query()?;
+
+    let database = cx.app_data();
+
+    let index = match database.open_index(&index).unwrap() {
+        Some(index) => index,
+        None => Err(StatusCode::NOT_FOUND)?,
+    };
+
+    let documents_ids = index.query_builder().query(&query.q, 0..100).unwrap();
+    let documents: Vec<Value> = documents_ids
+        .into_iter()
+        .filter_map(|x| index.document(None, x.id).unwrap())
+        .collect();
+
+    Ok(response::json(documents))
+}
+
+fn main() -> std::io::Result<()> {
+    let tmp_dir = tempfile::tempdir().unwrap();
+    let database = Database::start_default(&tmp_dir).unwrap();
+    let mut app = App::new(database);
+
+    app.at("/:index").post(create_index).put(update_documents);
+    app.at("/:index/search").get(search_index);
+
+    app.serve("127.0.0.1:8000")
+}

diff --git a/meilidb/src/sort_by_attr.rs b/meilidb/src/sort_by_attr.rs
deleted file mode 100644
index 2cacaae13..000000000
--- a/meilidb/src/sort_by_attr.rs
+++ /dev/null
@@ -1,121 +0,0 @@
-use std::cmp::Ordering;
-use std::error::Error;
-use std::fmt;
-
-use meilidb_core::criterion::Criterion;
-use meilidb_core::RawDocument;
-use meilidb_data::{Schema, SchemaAttr, RankedMap};
-
-/// An helper struct that permit to sort documents by
-/// some of their stored attributes.
-///
-/// # Note
-///
-/// If a document cannot be deserialized it will be considered [`None`][].
-/// -/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`, -/// so you must check the [`Ord`] of `Option` implementation. -/// -/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None -/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord -/// -/// # Example -/// -/// ```ignore -/// use serde_derive::Deserialize; -/// use meilidb::rank::criterion::*; -/// -/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; -/// -/// let builder = CriteriaBuilder::with_capacity(8) -/// .add(SumOfTypos) -/// .add(NumberOfWords) -/// .add(WordsProximity) -/// .add(SumOfWordsAttribute) -/// .add(SumOfWordsPosition) -/// .add(Exact) -/// .add(custom_ranking) -/// .add(DocumentId); -/// -/// let criterion = builder.build(); -/// -/// ``` -pub struct SortByAttr<'a> { - ranked_map: &'a RankedMap, - attr: SchemaAttr, - reversed: bool, -} - -impl<'a> SortByAttr<'a> { - pub fn lower_is_better( - ranked_map: &'a RankedMap, - schema: &Schema, - attr_name: &str, - ) -> Result, SortByAttrError> - { - SortByAttr::new(ranked_map, schema, attr_name, false) - } - - pub fn higher_is_better( - ranked_map: &'a RankedMap, - schema: &Schema, - attr_name: &str, - ) -> Result, SortByAttrError> - { - SortByAttr::new(ranked_map, schema, attr_name, true) - } - - fn new( - ranked_map: &'a RankedMap, - schema: &Schema, - attr_name: &str, - reversed: bool, - ) -> Result, SortByAttrError> - { - let attr = match schema.attribute(attr_name) { - Some(attr) => attr, - None => return Err(SortByAttrError::AttributeNotFound), - }; - - if !schema.props(attr).is_ranked() { - return Err(SortByAttrError::AttributeNotRegisteredForRanking); - } - - Ok(SortByAttr { ranked_map, attr, reversed }) - } -} - -impl<'a> Criterion for SortByAttr<'a> { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = self.ranked_map.get(lhs.id, self.attr); - let rhs = self.ranked_map.get(rhs.id, self.attr); - - match (lhs, rhs) { - (Some(lhs), Some(rhs)) => { - let order = lhs.cmp(&rhs); - if self.reversed { order.reverse() } else { order } - }, - (None, Some(_)) => Ordering::Greater, - (Some(_), None) => Ordering::Less, - (None, None) => Ordering::Equal, - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SortByAttrError { - AttributeNotFound, - AttributeNotRegisteredForRanking, -} - -impl fmt::Display for SortByAttrError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use SortByAttrError::*; - match self { - AttributeNotFound => f.write_str("attribute not found in the schema"), - AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"), - } - } -} - -impl Error for SortByAttrError { } From 8c714734986b685717c65cd46f2efe86cf22fc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 20 May 2019 11:18:59 +0200 Subject: [PATCH 30/32] feat: Introduce the `Criterion::name` to allow better debugging --- meilidb-core/src/criterion/document_id.rs | 4 ++++ meilidb-core/src/criterion/exact.rs | 4 ++++ meilidb-core/src/criterion/mod.rs | 11 +++++++++++ meilidb-core/src/criterion/number_of_words.rs | 4 ++++ meilidb-core/src/criterion/sum_of_typos.rs | 4 ++++ .../src/criterion/sum_of_words_attribute.rs | 4 ++++ .../src/criterion/sum_of_words_position.rs | 4 ++++ meilidb-core/src/criterion/words_proximity.rs | 4 ++++ meilidb-core/src/query_builder.rs | 16 ++++++++-------- 9 files changed, 47 insertions(+), 8 deletions(-) diff --git 
a/meilidb-core/src/criterion/document_id.rs b/meilidb-core/src/criterion/document_id.rs index 27025a2da..34d0bd7f5 100644 --- a/meilidb-core/src/criterion/document_id.rs +++ b/meilidb-core/src/criterion/document_id.rs @@ -9,4 +9,8 @@ impl Criterion for DocumentId { fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { lhs.id.cmp(&rhs.id) } + + fn name(&self) -> &'static str { + "DocumentId" + } } diff --git a/meilidb-core/src/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs index b76e9ace5..b038b9076 100644 --- a/meilidb-core/src/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -36,4 +36,8 @@ impl Criterion for Exact { lhs.cmp(&rhs).reverse() } + + fn name(&self) -> &'static str { + "Exact" + } } diff --git a/meilidb-core/src/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs index 2ad3a183c..2ed5a54ba 100644 --- a/meilidb-core/src/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -22,6 +22,9 @@ pub use self::{ pub trait Criterion: Send + Sync { fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; + #[inline] + fn name(&self) -> &'static str; + #[inline] fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { self.evaluate(lhs, rhs) == Ordering::Equal @@ -33,6 +36,10 @@ impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { (**self).evaluate(lhs, rhs) } + fn name(&self) -> &'static str { + (**self).name() + } + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { (**self).eq(lhs, rhs) } @@ -43,6 +50,10 @@ impl Criterion for Box { (**self).evaluate(lhs, rhs) } + fn name(&self) -> &'static str { + (**self).name() + } + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { (**self).eq(lhs, rhs) } diff --git a/meilidb-core/src/criterion/number_of_words.rs b/meilidb-core/src/criterion/number_of_words.rs index 798123e6a..43095a066 100644 --- a/meilidb-core/src/criterion/number_of_words.rs +++ b/meilidb-core/src/criterion/number_of_words.rs @@ -24,4 +24,8 @@ impl Criterion for NumberOfWords { lhs.cmp(&rhs).reverse() } + + fn name(&self) -> &'static str { + "NumberOfWords" + } } diff --git a/meilidb-core/src/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs index 714766a20..d5cd75f08 100644 --- a/meilidb-core/src/criterion/sum_of_typos.rs +++ b/meilidb-core/src/criterion/sum_of_typos.rs @@ -53,6 +53,10 @@ impl Criterion for SumOfTypos { lhs.cmp(&rhs).reverse() } + + fn name(&self) -> &'static str { + "SumOfTypos" + } } #[cfg(test)] diff --git a/meilidb-core/src/criterion/sum_of_words_attribute.rs b/meilidb-core/src/criterion/sum_of_words_attribute.rs index a46787797..f5a3c3576 100644 --- a/meilidb-core/src/criterion/sum_of_words_attribute.rs +++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs @@ -35,4 +35,8 @@ impl Criterion for SumOfWordsAttribute { lhs.cmp(&rhs) } + + fn name(&self) -> &'static str { + "SumOfWordsAttribute" + } } diff --git a/meilidb-core/src/criterion/sum_of_words_position.rs b/meilidb-core/src/criterion/sum_of_words_position.rs index 86f4e93fa..e365aef3b 100644 --- a/meilidb-core/src/criterion/sum_of_words_position.rs +++ b/meilidb-core/src/criterion/sum_of_words_position.rs @@ -35,4 +35,8 @@ impl Criterion for SumOfWordsPosition { lhs.cmp(&rhs) } + + fn name(&self) -> &'static str { + "SumOfWordsPosition" + } } diff --git a/meilidb-core/src/criterion/words_proximity.rs b/meilidb-core/src/criterion/words_proximity.rs index fc6c8bb31..10f167bef 100644 --- a/meilidb-core/src/criterion/words_proximity.rs +++ 
b/meilidb-core/src/criterion/words_proximity.rs @@ -98,6 +98,10 @@ impl Criterion for WordsProximity { lhs.cmp(&rhs) } + + fn name(&self) -> &'static str { + "WordsProximity" + } } #[cfg(test)] diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 058c61ecc..b32fa3902 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -157,13 +157,11 @@ where S: Store, let mut groups = vec![documents.as_mut_slice()]; - 'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() { + 'criteria: for criterion in self.criteria.as_ref() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut documents_seen = 0; for group in tmp_groups { - info!("criterion {}, documents group of size {}", ci, group.len()); - // if this group does not overlap with the requested range, // push it without sorting and splitting it if documents_seen + group.len() < range.start { @@ -174,9 +172,11 @@ where S: Store, let start = Instant::now(); group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); - info!("criterion {} sort took {:.2?}", ci, start.elapsed()); + info!("criterion {} sort took {:.2?}", criterion.name(), start.elapsed()); for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { + info!("criterion {} produced a group of size {}", criterion.name(), group.len()); + documents_seen += group.len(); groups.push(group); @@ -237,14 +237,12 @@ where S: Store, let mut distinct_map = DistinctMap::new(self.size); let mut distinct_raw_offset = 0; - 'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() { + 'criteria: for criterion in self.inner.criteria.as_ref() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); let mut documents_seen = 0; for group in tmp_groups { - info!("criterion {}, documents group of size {}", ci, group.len()); - // if this group does not overlap with the requested range, // push it without sorting and splitting it if documents_seen + group.len() < distinct_raw_offset { @@ -255,7 +253,7 @@ where S: Store, let start = Instant::now(); group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); - info!("criterion {} sort took {:.2?}", ci, start.elapsed()); + info!("criterion {} sort took {:.2?}", criterion.name(), start.elapsed()); for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { // we must compute the real distinguished len of this sub-group @@ -282,6 +280,8 @@ where S: Store, if buf_distinct.len() >= range.end { break } } + info!("criterion {} produced a group of size {}", criterion.name(), group.len()); + documents_seen += group.len(); groups.push(group); From 6df8f62022910dc6c8f0bac035da7ebde00fd3e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 20 May 2019 11:19:25 +0200 Subject: [PATCH 31/32] test: Add more test to some criteria --- meilidb-core/src/criterion/exact.rs | 22 +++++++++++++++++++ .../src/criterion/sum_of_words_attribute.rs | 22 +++++++++++++++++++ .../src/criterion/sum_of_words_position.rs | 22 +++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/meilidb-core/src/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs index b038b9076..bde3ca733 100644 --- a/meilidb-core/src/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -41,3 +41,25 @@ impl Criterion for Exact { "Exact" } } + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "soulier" + // + // doc0: "Soulier bleu" + // doc1: "souliereres 
rouge" + #[test] + fn easy_case() { + let query_index0 = &[0]; + let is_exact0 = &[true]; + + let query_index1 = &[0]; + let is_exact1 = &[false]; + + let doc0 = number_exact_matches(query_index0, is_exact0); + let doc1 = number_exact_matches(query_index1, is_exact1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } +} diff --git a/meilidb-core/src/criterion/sum_of_words_attribute.rs b/meilidb-core/src/criterion/sum_of_words_attribute.rs index f5a3c3576..d5787ef3a 100644 --- a/meilidb-core/src/criterion/sum_of_words_attribute.rs +++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs @@ -40,3 +40,25 @@ impl Criterion for SumOfWordsAttribute { "SumOfWordsAttribute" } } + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "soulier" + // + // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } + // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" } + #[test] + fn title_vs_description() { + let query_index0 = &[0]; + let attribute0 = &[0]; + + let query_index1 = &[0]; + let attribute1 = &[1]; + + let doc0 = sum_matches_attributes(query_index0, attribute0); + let doc1 = sum_matches_attributes(query_index1, attribute1); + assert_eq!(doc0.cmp(&doc1), Ordering::Less); + } +} diff --git a/meilidb-core/src/criterion/sum_of_words_position.rs b/meilidb-core/src/criterion/sum_of_words_position.rs index e365aef3b..13f26774c 100644 --- a/meilidb-core/src/criterion/sum_of_words_position.rs +++ b/meilidb-core/src/criterion/sum_of_words_position.rs @@ -40,3 +40,25 @@ impl Criterion for SumOfWordsPosition { "SumOfWordsPosition" } } + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "soulier" + // + // doc0: "Soulier bleu" + // doc1: "Botte rouge et soulier noir" + #[test] + fn easy_case() { + let query_index0 = &[0]; + let word_index0 = &[0]; + + let query_index1 = &[0]; + let word_index1 = &[3]; + + let doc0 = sum_matches_attribute_index(query_index0, word_index0); + let doc1 = sum_matches_attribute_index(query_index1, word_index1); + assert_eq!(doc0.cmp(&doc1), Ordering::Less); + } +} From 94f9587db1308e613fb490c9764acb25e33066c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 20 May 2019 11:19:56 +0200 Subject: [PATCH 32/32] feat: Implement Debug on RawDocument for more convenience --- meilidb-core/src/lib.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 838d787dd..e61fa543b 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -4,6 +4,7 @@ mod query_builder; mod store; pub mod criterion; +use std::fmt; use std::sync::Arc; use rayon::slice::ParallelSliceMut; @@ -210,6 +211,21 @@ impl RawDocument { } } +impl fmt::Debug for RawDocument { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("RawDocument") + .field("id", &self.id) + .field("query_index", &self.query_index()) + .field("distance", &self.distance()) + .field("attribute", &self.attribute()) + .field("word_index", &self.word_index()) + .field("is_exact", &self.is_exact()) + .field("char_index", &self.char_index()) + .field("char_length", &self.char_length()) + .finish() + } +} + pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec { let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); let mut matches2 = Matches::with_capacity(matches.len());