From 731ed111537e727e84815cc236e0f3eb80702eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 7 Dec 2018 11:32:27 +0100 Subject: [PATCH] feat: Index and store/serialize attributes while creating the update --- src/database/database_view.rs | 17 +- src/database/mod.rs | 50 ++--- src/index/update/positive/update.rs | 278 +++++++++++++++++++++------- src/lib.rs | 1 - src/rank/ranked_stream.rs | 4 +- src/retrieve.rs | 60 ------ src/tokenizer/mod.rs | 46 ----- 7 files changed, 240 insertions(+), 216 deletions(-) delete mode 100644 src/retrieve.rs diff --git a/src/database/database_view.rs b/src/database/database_view.rs index 91f097660..20f3340e5 100644 --- a/src/database/database_view.rs +++ b/src/database/database_view.rs @@ -9,24 +9,9 @@ use crate::database::deserializer::{Deserializer, DeserializerError}; use crate::database::{DATA_INDEX, DATA_SCHEMA}; use crate::blob::positive::PositiveBlob; use crate::index::schema::Schema; -use crate::database::{DocumentKey, DocumentKeyAttr}; +use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr}; use crate::DocumentId; -// FIXME Do not panic! -fn retrieve_data_schema(snapshot: &Snapshot<&DB>) -> Result> { - match snapshot.get(DATA_SCHEMA)? { - Some(vector) => Ok(Schema::read_from(&*vector)?), - None => panic!("BUG: no schema found in the database"), - } -} - -fn retrieve_data_index(snapshot: &Snapshot<&DB>) -> Result> { - match snapshot.get(DATA_INDEX)? { - Some(vector) => Ok(bincode::deserialize(&*vector)?), - None => Ok(PositiveBlob::default()), - } -} - pub struct DatabaseView<'a> { snapshot: Snapshot<&'a DB>, schema: Schema, diff --git a/src/database/mod.rs b/src/database/mod.rs index 246071a1a..a990fffae 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -1,15 +1,17 @@ use std::error::Error; use std::path::Path; +use std::ops::Deref; use std::fmt; use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions}; use rocksdb::{DB, DBVector, MergeOperands, SeekKey}; -use rocksdb::rocksdb::Writable; +use rocksdb::rocksdb::{Writable, Snapshot}; -pub use crate::database::database_view::DatabaseView; pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; +pub use crate::database::database_view::DatabaseView; use crate::index::update::Update; use crate::index::schema::Schema; +use crate::blob::positive::PositiveBlob; use crate::blob::{self, Blob}; mod document_key; @@ -19,6 +21,24 @@ mod deserializer; const DATA_INDEX: &[u8] = b"data-index"; const DATA_SCHEMA: &[u8] = b"data-schema"; +pub fn retrieve_data_schema(snapshot: &Snapshot) -> Result> +where D: Deref +{ + match snapshot.get(DATA_SCHEMA)? { + Some(vector) => Ok(Schema::read_from(&*vector)?), + None => Err(String::from("BUG: no schema found in the database").into()), + } +} + +pub fn retrieve_data_index(snapshot: &Snapshot) -> Result> +where D: Deref +{ + match snapshot.get(DATA_INDEX)? { + Some(vector) => Ok(bincode::deserialize(&*vector)?), + None => Ok(PositiveBlob::default()), + } +} + pub struct Database(DB); impl Database { @@ -162,14 +182,14 @@ mod tests { struct SimpleDoc { title: String, description: String, + timestamp: u64, } - let title; - let description; let schema = { let mut builder = SchemaBuilder::new(); - title = builder.new_attribute("title", STORED | INDEXED); - description = builder.new_attribute("description", STORED | INDEXED); + builder.new_attribute("title", STORED | INDEXED); + builder.new_attribute("description", STORED | INDEXED); + builder.new_attribute("timestamp", STORED); builder.build() }; @@ -181,21 +201,17 @@ mod tests { let doc0 = SimpleDoc { title: String::from("I am a title"), description: String::from("I am a description"), + timestamp: 1234567, }; let doc1 = SimpleDoc { title: String::from("I am the second title"), description: String::from("I am the second description"), + timestamp: 7654321, }; let mut update = { let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder); - // builder.update_field(0, title, doc0.title.clone()); - // builder.update_field(0, description, doc0.description.clone()); - - // builder.update_field(1, title, doc1.title.clone()); - // builder.update_field(1, description, doc1.description.clone()); - builder.update(0, &doc0).unwrap(); builder.update(1, &doc1).unwrap(); @@ -206,19 +222,9 @@ mod tests { database.ingest_update_file(update)?; let view = database.view()?; - println!("{:?}", view); - - #[derive(Deserialize, Debug, Clone, PartialEq, Eq)] - struct DeSimpleDoc { - title: char, - } - let de_doc0: SimpleDoc = view.retrieve_document(0)?; let de_doc1: SimpleDoc = view.retrieve_document(1)?; - println!("{:?}", de_doc0); - println!("{:?}", de_doc1); - assert_eq!(doc0, de_doc0); assert_eq!(doc1, de_doc1); diff --git a/src/index/update/positive/update.rs b/src/index/update/positive/update.rs index c5154d270..da25ffb41 100644 --- a/src/index/update/positive/update.rs +++ b/src/index/update/positive/update.rs @@ -25,6 +25,7 @@ pub struct PositiveUpdateBuilder { path: PathBuf, schema: Schema, tokenizer_builder: B, + builder: UnorderedPositiveBlobBuilder, Vec>, new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>, } @@ -34,14 +35,19 @@ impl PositiveUpdateBuilder { path: path.into(), schema: schema, tokenizer_builder: tokenizer_builder, + builder: UnorderedPositiveBlobBuilder::memory(), new_states: BTreeMap::new(), } } - pub fn update(&mut self, id: DocumentId, document: &T) -> Result<(), Box> { + pub fn update(&mut self, id: DocumentId, document: &T) -> Result<(), Box> + where B: TokenizerBuilder + { let serializer = Serializer { schema: &self.schema, document_id: id, + tokenizer_builder: &self.tokenizer_builder, + builder: &mut self.builder, new_states: &mut self.new_states }; @@ -90,9 +96,11 @@ impl fmt::Display for SerializerError { impl Error for SerializerError {} -struct Serializer<'a> { +struct Serializer<'a, B> { schema: &'a Schema, + tokenizer_builder: &'a B, document_id: DocumentId, + builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>, } @@ -106,7 +114,9 @@ macro_rules! forward_to_unserializable_type { } } -impl<'a> ser::Serializer for Serializer<'a> { +impl<'a, B> ser::Serializer for Serializer<'a, B> +where B: TokenizerBuilder +{ type Ok = (); type Error = SerializerError; type SerializeSeq = ser::Impossible; @@ -114,7 +124,7 @@ impl<'a> ser::Serializer for Serializer<'a> { type SerializeTupleStruct = ser::Impossible; type SerializeTupleVariant = ser::Impossible; type SerializeMap = ser::Impossible; - type SerializeStruct = StructSerializer<'a>; + type SerializeStruct = StructSerializer<'a, B>; type SerializeStructVariant = ser::Impossible; forward_to_unserializable_type! { @@ -238,7 +248,9 @@ impl<'a> ser::Serializer for Serializer<'a> { { Ok(StructSerializer { schema: self.schema, + tokenizer_builder: self.tokenizer_builder, document_id: self.document_id, + builder: self.builder, new_states: self.new_states, }) } @@ -255,33 +267,17 @@ impl<'a> ser::Serializer for Serializer<'a> { } } -fn serialize_field( - schema: &Schema, - document_id: DocumentId, - new_states: &mut BTreeMap<(DocumentId, SchemaAttr), NewState>, - name: &str, - value: Vec, -) -> Result<(), SerializerError> -{ - match schema.attribute(name) { - Some(attr) => { - let props = schema.props(attr); - if props.is_stored() { - new_states.insert((document_id, attr), NewState::Updated { value }); - } - Ok(()) - }, - None => Err(SerializerError::SchemaDontMatch { attribute: name.to_owned() }), - } -} - -struct StructSerializer<'a> { +struct StructSerializer<'a, B> { schema: &'a Schema, + tokenizer_builder: &'a B, document_id: DocumentId, + builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>, } -impl<'a> ser::SerializeStruct for StructSerializer<'a> { +impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B> +where B: TokenizerBuilder +{ type Ok = (); type Error = SerializerError; @@ -292,11 +288,26 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { ) -> Result<(), Self::Error> where T: Serialize, { - let value = match bincode::serialize(value) { - Ok(value) => value, - Err(e) => return Err(SerializerError::UnserializableType { name: "???" }), - }; - serialize_field(self.schema, self.document_id, self.new_states, key, value) + match self.schema.attribute(key) { + Some(attr) => { + let props = self.schema.props(attr); + if props.is_stored() { + let value = bincode::serialize(value).unwrap(); + self.new_states.insert((self.document_id, attr), NewState::Updated { value }); + } + if props.is_indexed() { + let serializer = IndexerSerializer { + builder: self.builder, + tokenizer_builder: self.tokenizer_builder, + document_id: self.document_id, + attribute: attr, + }; + value.serialize(serializer)?; + } + Ok(()) + }, + None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }), + } } fn end(self) -> Result { @@ -304,52 +315,181 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { } } -impl PositiveUpdateBuilder +struct IndexerSerializer<'a, B> { + tokenizer_builder: &'a B, + builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, + document_id: DocumentId, + attribute: SchemaAttr, +} + +impl<'a, B> ser::Serializer for IndexerSerializer<'a, B> where B: TokenizerBuilder { + type Ok = (); + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! { + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, v: &str) -> Result { + for (index, word) in self.tokenizer_builder.build(v) { + let doc_index = DocIndex { + document_id: self.document_id, + attribute: self.attribute.as_u32() as u8, + attribute_index: index as u32, + }; + + // insert the exact representation + let word_lower = word.to_lowercase(); + + // and the unidecoded lowercased version + let word_unidecoded = unidecode::unidecode(word).to_lowercase(); + if word_lower != word_unidecoded { + self.builder.insert(word_unidecoded, doc_index); + } + + self.builder.insert(word_lower, doc_index); + } + Ok(()) + } + + fn serialize_bytes(self, v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "seq" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} + +impl PositiveUpdateBuilder { pub fn build(self) -> Result> { let env_options = rocksdb_options::EnvOptions::new(); let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); file_writer.open(&self.path.to_string_lossy())?; - let mut builder = UnorderedPositiveBlobBuilder::memory(); - for ((document_id, attr), state) in &self.new_states { - let props = self.schema.props(*attr); - let value = match state { - NewState::Updated { value } if props.is_indexed() => value, - _ => continue, - }; - - let value: String = match bincode::deserialize(&value) { - Ok(value) => value, - Err(e) => { - eprintln!("{}", e); - continue - }, - }; - - for (index, word) in self.tokenizer_builder.build(&value) { - let doc_index = DocIndex { - document_id: *document_id, - attribute: attr.as_u32() as u8, - attribute_index: index as u32, - }; - - // insert the exact representation - let word_lower = word.to_lowercase(); - - // and the unidecoded lowercased version - let word_unidecoded = unidecode::unidecode(word).to_lowercase(); - if word_lower != word_unidecoded { - builder.insert(word_unidecoded, doc_index); - } - - builder.insert(word_lower, doc_index); - } - } - - let (blob_fst_map, blob_doc_idx) = builder.into_inner()?; + let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?; let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?; let blob = Blob::Positive(positive_blob); diff --git a/src/lib.rs b/src/lib.rs index ecb48b9b9..f83e59c37 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,6 @@ pub mod automaton; pub mod blob; pub mod database; pub mod data; -pub mod retrieve; pub mod index; pub mod rank; pub mod tokenizer; diff --git a/src/rank/ranked_stream.rs b/src/rank/ranked_stream.rs index df32f9f32..6dba99c9a 100644 --- a/src/rank/ranked_stream.rs +++ b/src/rank/ranked_stream.rs @@ -11,9 +11,9 @@ use fst::Streamer; use crate::automaton::{self, DfaExt, AutomatonExt}; use crate::rank::criterion::{self, Criterion}; use crate::rank::distinct_map::DistinctMap; +use crate::database::retrieve_data_index; use crate::blob::PositiveBlob; use crate::{Match, DocumentId}; -use crate::retrieve::Retrieve; use crate::rank::Document; fn clamp_range(range: Range, big: Range) -> Range { @@ -48,7 +48,7 @@ impl QueryBuilder where T: Deref, { pub fn with_criteria(snapshot: Snapshot, criteria: Vec) -> Result> { - let blob = snapshot.data_index()?; + let blob = retrieve_data_index(&snapshot)?; Ok(QueryBuilder { snapshot, blob, criteria }) } diff --git a/src/retrieve.rs b/src/retrieve.rs deleted file mode 100644 index d9ba3f4a0..000000000 --- a/src/retrieve.rs +++ /dev/null @@ -1,60 +0,0 @@ -use std::error::Error; -use std::ops::Deref; - -use ::rocksdb::rocksdb::{DB, Snapshot, DBVector}; - -use crate::index::schema::{Schema, SchemaAttr}; -use crate::blob::PositiveBlob; -use crate::DocumentId; - -pub struct DocDatabase<'a, R: ?Sized> { - retrieve: &'a R, - schema: Schema, -} - -impl<'a, R> DocDatabase<'a, R> { - pub fn get_document(&self, id: DocumentId) -> Result, Box> { - // if ids.is_empty() { return Ok(Vec::new()) } - unimplemented!() - } - - pub fn get_document_attribute(&self, id: DocumentId, attr: SchemaAttr) -> Result> { - unimplemented!() - } -} - -pub trait Retrieve { - fn schema(&self) -> Result, Box>; - fn data_index(&self) -> Result>; - fn doc_database(&self) -> Result, Box>; -} - -impl Retrieve for Snapshot -where T: Deref, -{ - fn schema(&self) -> Result, Box> { - match self.deref().get(b"data-schema")? { - Some(value) => Ok(Some(Schema::read_from(&*value)?)), - None => Ok(None), - } - } - - fn data_index(&self) -> Result> { - match self.deref().get(b"data-index")? { - Some(value) => Ok(bincode::deserialize(&value)?), - None => Ok(PositiveBlob::default()), - } - } - - fn doc_database(&self) -> Result, Box> { - let schema = match self.schema()? { - Some(schema) => schema, - None => return Err(String::from("BUG: could not find schema").into()), - }; - - Ok(DocDatabase { - retrieve: self, - schema: schema, - }) - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 58ef52d5c..9b075786b 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -1,52 +1,6 @@ use std::mem; use self::Separator::*; -struct MegaTokenizer { - strings: I, -} - -impl From for MegaTokenizer> { - fn from(string: String) -> Self { - MegaTokenizer { strings: Some(string) } - } -} - -impl From> for MegaTokenizer> { - fn from(strings: Vec) -> Self { - MegaTokenizer { strings } - } -} - -impl Iterator for MegaTokenizer { - type Item = (usize, String); - - fn next(&mut self) -> Option { - unimplemented!() - } -} - -#[test] -fn xxx() { - let s1 = "hello world!"; - let mut s1 = MegaTokenizer::from(s1.to_owned()); - - assert_eq!(s1.next(), Some((0, "hello".into()))); - assert_eq!(s1.next(), Some((1, "world".into()))); - - assert_eq!(s1.next(), None); - - let v1 = vec!["Vin Diesel".to_owned(), "Quentin Tarantino".to_owned()]; - let mut v1 = MegaTokenizer::from(v1); - - assert_eq!(v1.next(), Some((0, "Vin".into()))); - assert_eq!(v1.next(), Some((1, "Diesel".into()))); - - assert_eq!(v1.next(), Some((8, "Quentin".into()))); - assert_eq!(v1.next(), Some((9, "Tarantino".into()))); - - assert_eq!(v1.next(), None); -} - pub trait TokenizerBuilder { fn build<'a>(&self, text: &'a str) -> Box + 'a>; }