From a43a772e9ad5f17c74eb99582683177d43d48cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 22 Nov 2018 18:28:11 +0100 Subject: [PATCH] feat: Introduce typed keys constructors --- src/blob/mod.rs | 17 +++-- src/data/doc_ids.rs | 3 +- src/index/identifier.rs | 100 ++++++++++++++++++++++++++++ src/index/mod.rs | 21 +++--- src/index/update/negative_update.rs | 20 +++--- src/index/update/positive_update.rs | 27 +++----- 6 files changed, 145 insertions(+), 43 deletions(-) create mode 100644 src/index/identifier.rs diff --git a/src/blob/mod.rs b/src/blob/mod.rs index d00514a93..6d39562b4 100644 --- a/src/blob/mod.rs +++ b/src/blob/mod.rs @@ -16,6 +16,7 @@ use fst::Map; use uuid::Uuid; use rocksdb::rocksdb::{DB, Snapshot}; +use crate::index::identifier::Identifier; use crate::data::DocIndexes; pub enum Blob { @@ -54,6 +55,10 @@ impl BlobName { pub fn new() -> BlobName { BlobName(Uuid::new_v4()) } + + pub fn as_bytes(&self) -> &[u8; 16] { + self.0.as_bytes() + } } impl fmt::Display for BlobName { @@ -113,21 +118,21 @@ pub fn blobs_from_blob_infos(infos: &[BlobInfo], snapshot: &Snapshot<&DB>) -> Re for info in infos { let blob = match info.sign { Sign::Positive => { - let key_map = format!("blob-{}-fst", info.name); - let map = match snapshot.get(key_map.as_bytes())? { + let blob_key = Identifier::blob(info.name).fst_map().build(); + let map = match snapshot.get(&blob_key)? { Some(value) => value.to_vec(), None => return Err(format!("No fst entry found for blob {}", info.name).into()), }; - let key_doc_idx = format!("blob-{}-doc-idx", info.name); - let doc_idx = match snapshot.get(key_doc_idx.as_bytes())? { + let blob_key = Identifier::blob(info.name).document_indexes().build(); + let doc_idx = match snapshot.get(&blob_key)? { Some(value) => value.to_vec(), None => return Err(format!("No doc-idx entry found for blob {}", info.name).into()), }; PositiveBlob::from_bytes(map, doc_idx).map(Blob::Positive)? }, Sign::Negative => { - let key_doc_ids = format!("blob-{}-doc-ids", info.name); - let doc_ids = match snapshot.get(key_doc_ids.as_bytes())? { + let blob_key = Identifier::blob(info.name).document_ids().build(); + let doc_ids = match snapshot.get(&blob_key)? { Some(value) => value.to_vec(), None => return Err(format!("No doc-ids entry found for blob {}", info.name).into()), }; diff --git a/src/data/doc_ids.rs b/src/data/doc_ids.rs index 00ae2f024..4907a29f8 100644 --- a/src/data/doc_ids.rs +++ b/src/data/doc_ids.rs @@ -23,7 +23,8 @@ impl DocIds { Ok(DocIds { doc_ids }) } - pub fn from_bytes(vec: Vec) -> io::Result { + pub fn from_bytes(vec: Vec) -> Result> { + // FIXME check if modulo DocumentId let len = vec.len(); let doc_ids = Data::Shared { vec: Arc::new(vec), diff --git a/src/index/identifier.rs b/src/index/identifier.rs new file mode 100644 index 000000000..0a2f3f3d4 --- /dev/null +++ b/src/index/identifier.rs @@ -0,0 +1,100 @@ +use std::io::Write; + +use byteorder::{NetworkEndian, WriteBytesExt}; + +use crate::index::schema::SchemaAttr; +use crate::blob::BlobName; +use crate::DocumentId; + +pub struct Identifier { + inner: Vec, +} + +impl Identifier { + pub fn data() -> Data { + let mut inner = Vec::new(); + let _ = inner.write(b"data"); + Data { inner } + } + + pub fn blob(name: BlobName) -> Blob { + let mut inner = Vec::new(); + let _ = inner.write(b"blob"); + let _ = inner.write(name.as_bytes()); + Blob { inner } + } + + pub fn document(id: DocumentId) -> Document { + let mut inner = Vec::new(); + let _ = inner.write(b"docu"); + let _ = inner.write(b"-"); + let _ = inner.write_u64::(id); + Document { inner } + } +} + +pub struct Data { + inner: Vec, +} + +impl Data { + pub fn blobs_order(mut self) -> Self { + let _ = self.inner.write(b"-"); + let _ = self.inner.write(b"blobs-order"); + self + } + + pub fn schema(mut self) -> Self { + let _ = self.inner.write(b"-"); + let _ = self.inner.write(b"schema"); + self + } + + pub fn build(self) -> Vec { + self.inner + } +} + +pub struct Blob { + inner: Vec, +} + +impl Blob { + pub fn document_indexes(mut self) -> Self { + let _ = self.inner.write(b"-"); + let _ = self.inner.write(b"doc-idx"); + self + } + + pub fn document_ids(mut self) -> Self { + let _ = self.inner.write(b"-"); + let _ = self.inner.write(b"doc-ids"); + self + } + + pub fn fst_map(mut self) -> Self { + let _ = self.inner.write(b"-"); + let _ = self.inner.write(b"fst"); + self + } + + pub fn build(self) -> Vec { + self.inner + } +} + +pub struct Document { + inner: Vec, +} + +impl Document { + pub fn attribute(mut self, attr: SchemaAttr) -> Self { + let _ = self.inner.write(b"-"); + let _ = self.inner.write_u32::(attr.as_u32()); + self + } + + pub fn build(self) -> Vec { + self.inner + } +} diff --git a/src/index/mod.rs b/src/index/mod.rs index d50e0706d..f03c8848e 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1,3 +1,4 @@ +pub mod identifier; pub mod schema; pub mod update; @@ -20,18 +21,12 @@ use crate::data::DocIdsBuilder; use crate::{DocIndex, DocumentId}; use crate::index::schema::Schema; use crate::index::update::Update; +use crate::index::identifier::Identifier; use crate::blob::{PositiveBlobBuilder, BlobInfo, Sign, Blob, blobs_from_blob_infos}; use crate::tokenizer::{TokenizerBuilder, DefaultBuilder, Tokenizer}; use crate::rank::{criterion, Config, RankedStream}; use crate::automaton; -const DATA_PREFIX: &str = "data"; -const BLOB_PREFIX: &str = "blob"; -const DOCU_PREFIX: &str = "docu"; - -const DATA_BLOBS_ORDER: &str = "data-blobs-order"; -const DATA_SCHEMA: &str = "data-schema"; - fn simple_vec_append(key: &[u8], value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { let mut output = Vec::new(); for bytes in operands.chain(value) { @@ -67,7 +62,8 @@ impl Index { let mut schema_bytes = Vec::new(); schema.write_to(&mut schema_bytes)?; - database.put(DATA_SCHEMA.as_bytes(), &schema_bytes)?; + let data_key = Identifier::data().schema().build(); + database.put(&data_key, &schema_bytes)?; Ok(Self { database }) } @@ -83,7 +79,8 @@ impl Index { let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?; - let _schema = match database.get(DATA_SCHEMA.as_bytes())? { + let data_key = Identifier::data().schema().build(); + let _schema = match database.get(&data_key)? { Some(value) => Schema::read_from(&*value)?, None => return Err(String::from("Database does not contain a schema").into()), }; @@ -105,7 +102,8 @@ impl Index { } pub fn schema(&self) -> Result> { - let bytes = self.database.get(DATA_SCHEMA.as_bytes())?.expect("data-schema entry not found"); + let data_key = Identifier::data().schema().build(); + let bytes = self.database.get(&data_key)?.expect("data-schema entry not found"); Ok(Schema::read_from(&*bytes).expect("Invalid schema")) } @@ -113,7 +111,8 @@ impl Index { // this snapshot will allow consistent reads for the whole search operation let snapshot = self.database.snapshot(); - let blobs = match snapshot.get(DATA_BLOBS_ORDER.as_bytes())? { + let data_key = Identifier::data().blobs_order().build(); + let blobs = match snapshot.get(&data_key)? { Some(value) => { let blob_infos = BlobInfo::read_from_slice(&value)?; blobs_from_blob_infos(&blob_infos, &snapshot)? diff --git a/src/index/update/negative_update.rs b/src/index/update/negative_update.rs index 20356b5d1..ef5d61c05 100644 --- a/src/index/update/negative_update.rs +++ b/src/index/update/negative_update.rs @@ -4,9 +4,9 @@ use std::error::Error; use ::rocksdb::rocksdb_options; use crate::blob::BlobInfo; -use crate::index::DATA_BLOBS_ORDER; use crate::index::update::Update; -use crate::data::DocIdsBuilder; +use crate::index::identifier::Identifier; +use crate::data::{DocIds, DocIdsBuilder}; use crate::DocumentId; pub struct NegativeUpdateBuilder { @@ -35,21 +35,23 @@ impl NegativeUpdateBuilder { file_writer.open(&self.path.to_string_lossy())?; // write the doc ids - let blob_key = format!("blob-{}-doc-ids", blob_info.name); + let blob_key = Identifier::blob(blob_info.name).document_ids().build(); let blob_doc_ids = self.doc_ids.into_inner()?; - file_writer.put(blob_key.as_bytes(), &blob_doc_ids)?; + file_writer.put(&blob_key, &blob_doc_ids)?; { // write the blob name to be merged let mut buffer = Vec::new(); blob_info.write_into(&mut buffer); - file_writer.merge(DATA_BLOBS_ORDER.as_bytes(), &buffer)?; + let data_key = Identifier::data().blobs_order().build(); + file_writer.merge(&data_key, &buffer)?; } - for id in blob_doc_ids { - let start = format!("docu-{}", id); - let end = format!("docu-{}", id + 1); - file_writer.delete_range(start.as_bytes(), end.as_bytes())?; + let blob_doc_ids = DocIds::from_bytes(blob_doc_ids)?; + for id in blob_doc_ids.doc_ids().iter().cloned() { + let start = Identifier::document(id).build(); + let end = Identifier::document(id + 1).build(); + file_writer.delete_range(&start, &end)?; } file_writer.finish()?; diff --git a/src/index/update/positive_update.rs b/src/index/update/positive_update.rs index 9378951ce..ad6e0f5d0 100644 --- a/src/index/update/positive_update.rs +++ b/src/index/update/positive_update.rs @@ -5,8 +5,8 @@ use std::fmt::Write; use ::rocksdb::rocksdb_options; -use crate::index::DATA_BLOBS_ORDER; use crate::index::update::Update; +use crate::index::identifier::Identifier; use crate::index::schema::{SchemaProps, Schema, SchemaAttr}; use crate::tokenizer::TokenizerBuilder; use crate::blob::{BlobInfo, PositiveBlobBuilder}; @@ -88,34 +88,29 @@ where B: TokenizerBuilder let (blob_fst_map, blob_doc_idx) = builder.into_inner()?; // write the doc-idx - let blob_key = format!("blob-{}-doc-idx", blob_info.name); - file_writer.put(blob_key.as_bytes(), &blob_doc_idx)?; + let blob_key = Identifier::blob(blob_info.name).document_indexes().build(); + file_writer.put(&blob_key, &blob_doc_idx)?; // write the fst - let blob_key = format!("blob-{}-fst", blob_info.name); - file_writer.put(blob_key.as_bytes(), &blob_fst_map)?; + let blob_key = Identifier::blob(blob_info.name).fst_map().build(); + file_writer.put(&blob_key, &blob_fst_map)?; { // write the blob name to be merged let mut buffer = Vec::new(); blob_info.write_into(&mut buffer); - file_writer.merge(DATA_BLOBS_ORDER.as_bytes(), &buffer)?; + let data_key = Identifier::data().blobs_order().build(); + file_writer.merge(&data_key, &buffer)?; } // write all the documents fields updates - let mut key = String::from("docu-"); - let prefix_len = key.len(); - - // FIXME write numbers in bytes not decimal representation - - for ((id, field), state) in self.new_states { - key.truncate(prefix_len); - write!(&mut key, "{}-{}", id, field)?; + for ((id, attr), state) in self.new_states { + let key = Identifier::document(id).attribute(attr).build(); match state { NewState::Updated { value, props } => if props.is_stored() { - file_writer.put(key.as_bytes(), value.as_bytes())? + file_writer.put(&key, value.as_bytes())? }, - NewState::Removed => file_writer.delete(key.as_bytes())?, + NewState::Removed => file_writer.delete(&key)?, } }