diff --git a/src/blob/mod.rs b/src/blob/mod.rs index 6d39562b4..7b404f3ce 100644 --- a/src/blob/mod.rs +++ b/src/blob/mod.rs @@ -15,6 +15,8 @@ use std::{io, fmt, mem}; use fst::Map; use uuid::Uuid; use rocksdb::rocksdb::{DB, Snapshot}; +use serde::ser::{Serialize, Serializer, SerializeTuple}; +use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor}; use crate::index::identifier::Identifier; use crate::data::DocIndexes; @@ -33,6 +35,65 @@ impl Blob { } } +impl Serialize for Blob { + fn serialize(&self, serializer: S) -> Result { + match self { + Blob::Positive(blob) => { + let mut tuple = serializer.serialize_tuple(2)?; + tuple.serialize_element(&Sign::Positive)?; + tuple.serialize_element(&blob)?; + tuple.end() + }, + Blob::Negative(blob) => { + let mut tuple = serializer.serialize_tuple(2)?; + tuple.serialize_element(&Sign::Negative)?; + tuple.serialize_element(&blob)?; + tuple.end() + }, + } + } +} + +impl<'de> Deserialize<'de> for Blob { + fn deserialize>(deserializer: D) -> Result { + struct TupleVisitor; + + impl<'de> Visitor<'de> for TupleVisitor { + type Value = Blob; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a Blob struct") + } + + #[inline] + fn visit_seq>(self, mut seq: A) -> Result { + let sign = match seq.next_element()? { + Some(value) => value, + None => return Err(de::Error::invalid_length(0, &self)), + }; + match sign { + Sign::Positive => { + let blob = match seq.next_element()? { + Some(value) => value, + None => return Err(de::Error::invalid_length(1, &self)), + }; + Ok(Blob::Positive(blob)) + }, + Sign::Negative => { + let blob = match seq.next_element()? { + Some(value) => value, + None => return Err(de::Error::invalid_length(1, &self)), + }; + Ok(Blob::Negative(blob)) + }, + } + } + } + + deserializer.deserialize_tuple(2, TupleVisitor) + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum Sign { Positive, diff --git a/src/blob/negative_blob.rs b/src/blob/negative_blob.rs index 0226b5ae5..19ff5d026 100644 --- a/src/blob/negative_blob.rs +++ b/src/blob/negative_blob.rs @@ -1,9 +1,11 @@ +use std::io::{Read, Write}; use std::error::Error; use std::path::Path; -use std::io::Write; use crate::DocumentId; use crate::data::{DocIds, DocIdsBuilder}; +use serde::ser::{Serialize, Serializer}; +use serde::de::{self, Deserialize, Deserializer}; pub struct NegativeBlob { doc_ids: DocIds, @@ -31,6 +33,19 @@ impl NegativeBlob { } } +impl Serialize for NegativeBlob { + fn serialize(&self, serializer: S) -> Result { + self.doc_ids.serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for NegativeBlob { + fn deserialize>(deserializer: D) -> Result { + let bytes = Vec::deserialize(deserializer)?; + NegativeBlob::from_bytes(bytes).map_err(de::Error::custom) + } +} + pub struct NegativeBlobBuilder { doc_ids: DocIdsBuilder, } diff --git a/src/blob/positive_blob.rs b/src/blob/positive_blob.rs index ac1e85d46..29966cd68 100644 --- a/src/blob/positive_blob.rs +++ b/src/blob/positive_blob.rs @@ -1,11 +1,14 @@ +use std::io::{Read, Write}; use std::error::Error; use std::path::Path; -use std::io::Write; +use std::fmt; use fst::{Map, MapBuilder}; use crate::DocIndex; use crate::data::{DocIndexes, DocIndexesBuilder}; +use serde::ser::{Serialize, Serializer, SerializeTuple}; +use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor}; pub struct PositiveBlob { map: Map, @@ -45,6 +48,52 @@ impl PositiveBlob { } } +impl Serialize for PositiveBlob { + fn serialize(&self, serializer: S) -> Result { + let mut tuple = serializer.serialize_tuple(2)?; + tuple.serialize_element(&self.map.as_fst().to_vec())?; + tuple.serialize_element(&self.indexes)?; + tuple.end() + } +} + +impl<'de> Deserialize<'de> for PositiveBlob { + fn deserialize>(deserializer: D) -> Result { + struct TupleVisitor; + + impl<'de> Visitor<'de> for TupleVisitor { + type Value = PositiveBlob; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a PositiveBlob struct") + } + + #[inline] + fn visit_seq>(self, mut seq: A) -> Result { + let map = match seq.next_element()? { + Some(bytes) => match Map::from_bytes(bytes) { + Ok(value) => value, + Err(err) => return Err(de::Error::custom(err)), + }, + None => return Err(de::Error::invalid_length(0, &self)), + }; + + let indexes = match seq.next_element()? { + Some(bytes) => match DocIndexes::from_bytes(bytes) { + Ok(value) => value, + Err(err) => return Err(de::Error::custom(err)), + }, + None => return Err(de::Error::invalid_length(1, &self)), + }; + + Ok(PositiveBlob { map, indexes }) + } + } + + deserializer.deserialize_tuple(2, TupleVisitor) + } +} + pub struct PositiveBlobBuilder { map: W, indexes: DocIndexesBuilder, diff --git a/src/data/doc_ids.rs b/src/data/doc_ids.rs index 4907a29f8..168ca0b46 100644 --- a/src/data/doc_ids.rs +++ b/src/data/doc_ids.rs @@ -7,31 +7,32 @@ use std::{io, mem}; use byteorder::{NativeEndian, WriteBytesExt}; use fst::raw::MmapReadOnly; +use serde::ser::{Serialize, Serializer}; use crate::DocumentId; use crate::data::Data; #[derive(Clone)] pub struct DocIds { - doc_ids: Data, + data: Data, } impl DocIds { pub unsafe fn from_path>(path: P) -> io::Result { let mmap = MmapReadOnly::open_path(path)?; - let doc_ids = Data::Mmap(mmap); - Ok(DocIds { doc_ids }) + let data = Data::Mmap(mmap); + Ok(DocIds { data }) } pub fn from_bytes(vec: Vec) -> Result> { // FIXME check if modulo DocumentId let len = vec.len(); - let doc_ids = Data::Shared { + let data = Data::Shared { vec: Arc::new(vec), offset: 0, len: len }; - Ok(DocIds { doc_ids }) + Ok(DocIds { data }) } pub fn contains(&self, doc: DocumentId) -> bool { @@ -40,13 +41,19 @@ impl DocIds { } pub fn doc_ids(&self) -> &[DocumentId] { - let slice = &self.doc_ids; + let slice = &self.data; let ptr = slice.as_ptr() as *const DocumentId; let len = slice.len() / mem::size_of::(); unsafe { from_raw_parts(ptr, len) } } } +impl Serialize for DocIds { + fn serialize(&self, serializer: S) -> Result { + self.data.as_ref().serialize(serializer) + } +} + pub struct DocIdsBuilder { doc_ids: BTreeSet, // TODO: prefer a linked-list wrt: W, diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index bb0cea837..97222b205 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -8,6 +8,7 @@ use std::mem; use fst::raw::MmapReadOnly; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use serde::ser::{Serialize, Serializer, SerializeTuple}; use crate::DocIndex; use crate::data::Data; @@ -84,6 +85,15 @@ impl DocIndexes { } } +impl Serialize for DocIndexes { + fn serialize(&self, serializer: S) -> Result { + let mut tuple = serializer.serialize_tuple(2)?; + tuple.serialize_element(self.ranges.as_ref())?; + tuple.serialize_element(self.indexes.as_ref())?; + tuple.end() + } +} + pub struct DocIndexesBuilder { keys: BTreeMap, indexes: Vec>, diff --git a/src/data/mod.rs b/src/data/mod.rs index 0f16621fb..ca9816fa3 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -23,6 +23,12 @@ impl Deref for Data { type Target = [u8]; fn deref(&self) -> &Self::Target { + self.as_ref() + } +} + +impl AsRef<[u8]> for Data { + fn as_ref(&self) -> &[u8] { match self { Data::Shared { vec, offset, len } => { &vec[*offset..offset + len] diff --git a/src/index/mod.rs b/src/index/mod.rs index f03c8848e..9de25f476 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -22,7 +22,7 @@ use crate::{DocIndex, DocumentId}; use crate::index::schema::Schema; use crate::index::update::Update; use crate::index::identifier::Identifier; -use crate::blob::{PositiveBlobBuilder, BlobInfo, Sign, Blob, blobs_from_blob_infos}; +use crate::blob::{PositiveBlobBuilder, PositiveBlob, BlobInfo, Sign, Blob, blobs_from_blob_infos}; use crate::tokenizer::{TokenizerBuilder, DefaultBuilder, Tokenizer}; use crate::rank::{criterion, Config, RankedStream}; use crate::automaton; @@ -35,6 +35,45 @@ fn simple_vec_append(key: &[u8], value: Option<&[u8]>, operands: &mut MergeOpera output } +pub struct MergeBuilder { + blobs: Vec, +} + +impl MergeBuilder { + pub fn new() -> MergeBuilder { + MergeBuilder { blobs: Vec::new() } + } + + pub fn push(&mut self, blob: Blob) { + if blob.sign() == Sign::Negative && self.blobs.is_empty() { return } + self.blobs.push(blob); + } + + pub fn merge(self) -> PositiveBlob { + unimplemented!() + } +} + +fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { + if key != b"data-index" { panic!("The merge operator only allow \"data-index\" merging") } + + let mut merge_builder = MergeBuilder::new(); + + if let Some(existing_value) = existing_value { + let base: PositiveBlob = bincode::deserialize(existing_value).unwrap(); // FIXME what do we do here ? + merge_builder.push(Blob::Positive(base)); + } + + for bytes in operands { + let blob: Blob = bincode::deserialize(bytes).unwrap(); + merge_builder.push(blob); + } + + let blob = merge_builder.merge(); + // blob.to_vec() + unimplemented!() +} + pub struct Index { database: rocksdb::DB, }