From 0c180262402c27214ec1c974a22bc88f9d28b4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 6 May 2019 14:13:09 +0200 Subject: [PATCH] feat: Introduce Tree wrappers for each index component --- meilidb-data/src/database.rs | 126 +++++++++++++++++++++---- meilidb-data/src/document_attr_key.rs | 69 ++++++++++++++ meilidb-data/src/lib.rs | 1 + meilidb-data/src/schema.rs | 14 +-- meilidb-data/src/serde/deserializer.rs | 2 +- 5 files changed, 186 insertions(+), 26 deletions(-) create mode 100644 meilidb-data/src/document_attr_key.rs diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 23d998886..667baf2e0 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,4 +1,5 @@ use std::collections::HashSet; +use std::convert::TryInto; use std::io::{self, Cursor, BufRead}; use std::iter::FromIterator; use std::path::Path; @@ -8,15 +9,17 @@ use std::{error, fmt}; use arc_swap::{ArcSwap, Lease}; use byteorder::{ReadBytesExt, BigEndian}; use hashbrown::HashMap; -use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId}; +use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId, DocIndex}; use rmp_serde::decode::{Error as RmpError}; use sdset::SetBuf; use serde::de; use sled::IVec; +use zerocopy::{AsBytes, LayoutVerified}; use crate::{Schema, SchemaAttr, RankedMap}; use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError}; use crate::indexer::{Indexer, WordIndexTree}; +use crate::document_attr_key::DocumentAttrKey; pub type WordIndex = meilidb_core::Index; @@ -27,6 +30,7 @@ pub enum Error { WordIndexMissing, MissingDocumentId, SledError(sled::Error), + FstError(fst::Error), BincodeError(bincode::Error), SerializerError(SerializerError), } @@ -37,6 +41,12 @@ impl From for Error { } } +impl From for Error { + fn from(error: fst::Error) -> Error { + Error::FstError(error) + } +} + impl From for Error { fn from(error: bincode::Error) -> Error { Error::BincodeError(error) @@ -58,6 +68,7 @@ impl fmt::Display for Error { WordIndexMissing => write!(f, "this index does not have a word index"), MissingDocumentId => write!(f, "document id is missing"), SledError(e) => write!(f, "sled error; {}", e), + FstError(e) => write!(f, "fst error; {}", e), BincodeError(e) => write!(f, "bincode error; {}", e), SerializerError(e) => write!(f, "serializer error; {}", e), } @@ -180,6 +191,102 @@ impl Database { } } +struct RawIndex2 { + main: MainIndex, + words: WordsIndex, + documents: DocumentsIndex, +} + +struct MainIndex(Arc); + +impl MainIndex { + fn schema(&self) -> Result, Error> { + match self.0.get("schema")? { + Some(bytes) => { + let schema = Schema::read_from_bin(bytes.as_ref())?; + Ok(Some(schema)) + }, + None => Ok(None), + } + } + + fn words_set(&self) -> Result, Error> { + match self.0.get("words")? { + Some(bytes) => { + let len = bytes.len(); + let value = bytes.into(); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + None => Ok(None), + } + } + + fn ranked_map(&self) -> Result, Error> { + match self.0.get("ranked-map")? { + Some(bytes) => { + let ranked_map = bincode::deserialize(bytes.as_ref())?; + Ok(Some(ranked_map)) + }, + None => Ok(None), + } + } +} + +struct WordsIndex(Arc); + +impl WordsIndex { + fn doc_indexes(&self, word: &[u8]) -> Result>, Error> { + match self.0.get(word)? { + Some(bytes) => { + let layout = LayoutVerified::new_slice(bytes.as_ref()).expect("invalid layout"); + let slice = layout.into_slice(); + let setbuf = SetBuf::new_unchecked(slice.to_vec()); + Ok(Some(setbuf)) + }, + None => Ok(None), + } + } +} + +struct DocumentsIndex(Arc); + +impl DocumentsIndex { + fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> Result, Error> { + let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + self.0.get(key).map_err(Into::into) + } + + fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { + let start = DocumentAttrKey::new(id, SchemaAttr::min()); + let start = start.to_be_bytes(); + + let end = DocumentAttrKey::new(id, SchemaAttr::max()); + let end = end.to_be_bytes(); + + DocumentFieldsIter(self.0.range(start..=end)) + } +} + +pub struct DocumentFieldsIter<'a>(sled::Iter<'a>); + +impl<'a> Iterator for DocumentFieldsIter<'a> { + type Item = Result<(SchemaAttr, IVec), Error>; + + fn next(&mut self) -> Option { + match self.0.next() { + Some(Ok((key, value))) => { + let slice: &[u8] = key.as_ref(); + let array = slice.try_into().unwrap(); + let key = DocumentAttrKey::from_be_bytes(array); + Some(Ok((key.attribute, value))) + }, + Some(Err(e)) => Some(Err(Error::SledError(e))), + None => None, + } + } +} + #[derive(Clone)] pub struct RawIndex { schema: Schema, @@ -294,23 +401,6 @@ impl RawIndex { } } -pub struct DocumentFieldsIter<'a>(sled::Iter<'a>); - -impl<'a> Iterator for DocumentFieldsIter<'a> { - type Item = Result<(DocumentId, SchemaAttr, IVec), Error>; - - fn next(&mut self) -> Option { - match self.0.next() { - Some(Ok((key, value))) => { - let (id, attr) = extract_document_key(key).unwrap(); - Some(Ok((id, attr, value))) - }, - Some(Err(e)) => Some(Err(Error::SledError(e))), - None => None, - } - } -} - #[derive(Clone)] pub struct Index(RawIndex); diff --git a/meilidb-data/src/document_attr_key.rs b/meilidb-data/src/document_attr_key.rs new file mode 100644 index 000000000..d8cb0bc8d --- /dev/null +++ b/meilidb-data/src/document_attr_key.rs @@ -0,0 +1,69 @@ +use meilidb_core::DocumentId; +use crate::schema::SchemaAttr; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct DocumentAttrKey { + pub document_id: DocumentId, + pub attribute: SchemaAttr, +} + +impl DocumentAttrKey { + pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey { + DocumentAttrKey { document_id, attribute } + } + + pub fn to_be_bytes(self) -> [u8; 10] { + let mut output = [0u8; 10]; + + let document_id = self.document_id.0.to_be_bytes(); + let attribute = self.attribute.0.to_be_bytes(); + + unsafe { + use std::{mem::size_of, ptr::copy_nonoverlapping}; + + let output = output.as_mut_ptr(); + copy_nonoverlapping(document_id.as_ptr(), output, size_of::()); + + let output = output.add(size_of::()); + copy_nonoverlapping(attribute.as_ptr(), output, size_of::()); + } + + output + } + + pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey { + let document_id; + let attribute; + + unsafe { + use std::ptr::read_unaligned; + + let pointer = bytes.as_ptr() as *const _; + let document_id_bytes = read_unaligned(pointer); + document_id = u64::from_be_bytes(document_id_bytes); + + let pointer = pointer.add(1) as *const _; + let attribute_bytes = read_unaligned(pointer); + attribute = u16::from_be_bytes(attribute_bytes); + } + + DocumentAttrKey { + document_id: DocumentId(document_id), + attribute: SchemaAttr(attribute), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn to_from_be_bytes() { + let document_id = DocumentId(67578308); + let schema_attr = SchemaAttr(3456); + let x = DocumentAttrKey::new(document_id, schema_attr); + + assert_eq!(x, DocumentAttrKey::from_be_bytes(x.to_be_bytes())); + } +} diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index 03f09728a..149a65fa8 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -1,4 +1,5 @@ mod database; +mod document_attr_key; mod indexer; mod number; mod ranked_map; diff --git a/meilidb-data/src/schema.rs b/meilidb-data/src/schema.rs index bff7806dc..cc39cf0e8 100644 --- a/meilidb-data/src/schema.rs +++ b/meilidb-data/src/schema.rs @@ -186,12 +186,16 @@ impl Schema { pub struct SchemaAttr(pub u16); impl SchemaAttr { - pub fn new(value: u16) -> SchemaAttr { + pub const fn new(value: u16) -> SchemaAttr { SchemaAttr(value) } - pub fn min() -> SchemaAttr { - SchemaAttr(0) + pub const fn min() -> SchemaAttr { + SchemaAttr(u16::min_value()) + } + + pub const fn max() -> SchemaAttr { + SchemaAttr(u16::max_value()) } pub fn next(self) -> Option { @@ -201,10 +205,6 @@ impl SchemaAttr { pub fn prev(self) -> Option { self.0.checked_sub(1).map(SchemaAttr) } - - pub fn max() -> SchemaAttr { - SchemaAttr(u16::MAX) - } } impl fmt::Display for SchemaAttr { diff --git a/meilidb-data/src/serde/deserializer.rs b/meilidb-data/src/serde/deserializer.rs index 12873713b..dbebd8abf 100644 --- a/meilidb-data/src/serde/deserializer.rs +++ b/meilidb-data/src/serde/deserializer.rs @@ -45,7 +45,7 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> }, } }); - let iter = document_attributes.filter_map(|(_, attr, value)| { + let iter = document_attributes.filter_map(|(attr, value)| { if self.fields.map_or(true, |f| f.contains(&attr)) { let attribute_name = self.raw_index.schema().attribute_name(attr); Some((attribute_name, Value::new(value)))