From 9a67db09898db083ebea7a3d3c346d0669151545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 1 Dec 2018 11:35:16 +0100 Subject: [PATCH] test: Add some tests to DocIndexes --- src/blob/positive/blob.rs | 8 +-- src/blob/positive/ops.rs | 3 +- src/data/doc_indexes.rs | 131 +++++++++++--------------------------- src/data/mod.rs | 2 +- src/index/mod.rs | 84 ------------------------ 5 files changed, 43 insertions(+), 185 deletions(-) diff --git a/src/blob/positive/blob.rs b/src/blob/positive/blob.rs index ee143dbef..851f4c686 100644 --- a/src/blob/positive/blob.rs +++ b/src/blob/positive/blob.rs @@ -6,7 +6,7 @@ use std::error::Error; use fst::{map, Map, Streamer, IntoStreamer}; use crate::DocIndex; -use crate::data::{DocIndexes, RawDocIndexesBuilder}; +use crate::data::{DocIndexes, DocIndexesBuilder}; use serde::ser::{Serialize, Serializer, SerializeTuple}; use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor}; @@ -135,7 +135,7 @@ impl<'de> Deserialize<'de> for PositiveBlob { pub struct PositiveBlobBuilder { map: fst::MapBuilder, - indexes: RawDocIndexesBuilder, + indexes: DocIndexesBuilder, value: u64, } @@ -143,7 +143,7 @@ impl PositiveBlobBuilder, Vec> { pub fn memory() -> Self { PositiveBlobBuilder { map: fst::MapBuilder::memory(), - indexes: RawDocIndexesBuilder::memory(), + indexes: DocIndexesBuilder::memory(), value: 0, } } @@ -153,7 +153,7 @@ impl PositiveBlobBuilder { pub fn new(map: W, indexes: X) -> Result> { Ok(PositiveBlobBuilder { map: fst::MapBuilder::new(map)?, - indexes: RawDocIndexesBuilder::new(indexes), + indexes: DocIndexesBuilder::new(indexes), value: 0, }) } diff --git a/src/blob/positive/ops.rs b/src/blob/positive/ops.rs index 78ae7adbb..2788d0c3c 100644 --- a/src/blob/positive/ops.rs +++ b/src/blob/positive/ops.rs @@ -106,7 +106,8 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> { let mut builder = SdOpBuilder::with_capacity(ivalues.len()); for ivalue in ivalues { - let indexes = self.indexes[ivalue.index].get(ivalue.value).expect(""); + let indexes = self.indexes[ivalue.index].get(ivalue.value); + let indexes = indexes.expect("BUG: could not find document indexes"); let set = Set::new_unchecked(indexes); builder.push(set); } diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index 28c3bde46..78e8ebe73 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -1,4 +1,3 @@ -use std::collections::btree_map::{BTreeMap, Iter, Entry}; use std::slice::from_raw_parts; use std::io::{self, Write}; use std::path::Path; @@ -12,6 +11,7 @@ use serde::ser::{Serialize, Serializer, SerializeTuple}; use crate::DocIndex; use crate::data::Data; +#[derive(Debug)] #[repr(C)] struct Range { start: u64, @@ -43,7 +43,7 @@ impl DocIndexes { fn from_data(data: Data) -> io::Result { let ranges_len_offset = data.len() - mem::size_of::(); let ranges_len = (&data[ranges_len_offset..]).read_u64::()?; - let ranges_len = ranges_len as usize * mem::size_of::(); + let ranges_len = ranges_len as usize; let ranges_offset = ranges_len_offset - ranges_len; let ranges = data.range(ranges_offset, ranges_len); @@ -85,20 +85,20 @@ impl Serialize for DocIndexes { } } -pub struct RawDocIndexesBuilder { +pub struct DocIndexesBuilder { ranges: Vec, wtr: W, } -impl RawDocIndexesBuilder> { +impl DocIndexesBuilder> { pub fn memory() -> Self { - RawDocIndexesBuilder::new(Vec::new()) + DocIndexesBuilder::new(Vec::new()) } } -impl RawDocIndexesBuilder { +impl DocIndexesBuilder { pub fn new(wtr: W) -> Self { - RawDocIndexesBuilder { + DocIndexesBuilder { ranges: Vec::new(), wtr: wtr, } @@ -106,7 +106,7 @@ impl RawDocIndexesBuilder { pub fn insert(&mut self, indexes: &[DocIndex]) -> io::Result<()> { let len = indexes.len() as u64; - let start = self.ranges.last().map(|r| r.start).unwrap_or(0); + let start = self.ranges.last().map(|r| r.end).unwrap_or(0); let range = Range { start, end: start + len }; self.ranges.push(range); @@ -132,95 +132,36 @@ impl RawDocIndexesBuilder { } } -pub struct DocIndexesBuilder { - keys: BTreeMap, - indexes: Vec>, - number_docs: usize, - wtr: W, -} - -impl DocIndexesBuilder { - pub fn new(wtr: W) -> Self { - Self { - keys: BTreeMap::new(), - indexes: Vec::new(), - number_docs: 0, - wtr: wtr, - } - } - - pub fn number_doc_indexes(&self) -> usize { - self.number_docs - } - - pub fn insert(&mut self, key: String, value: DocIndex) { - match self.keys.entry(key) { - Entry::Vacant(e) => { - let index = self.indexes.len() as u64; - self.indexes.push(vec![value]); - e.insert(index); - }, - Entry::Occupied(e) => { - let index = *e.get(); - let vec = &mut self.indexes[index as usize]; - vec.push(value); - }, - } - self.number_docs += 1; - } - - pub fn keys(&self) -> Iter { - self.keys.iter() - } - - pub fn finish(self) -> io::Result<()> { - self.into_inner().map(drop) - } - - pub fn into_inner(mut self) -> io::Result { - for vec in &mut self.indexes { - vec.sort_unstable(); - } - - let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs); - - // write values first - let slice = unsafe { into_u8_slice(values.as_slice()) }; - self.wtr.write_all(slice)?; - - // write ranges after - let slice = unsafe { into_u8_slice(ranges.as_slice()) }; - self.wtr.write_all(slice)?; - - // write the length of the ranges - let len = ranges.len() as u64; - self.wtr.write_u64::(len)?; - - self.wtr.flush()?; - Ok(self.wtr) - } -} - -fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { - let cap = vecs.len(); - let mut ranges = Vec::with_capacity(cap); - let mut values = Vec::with_capacity(number_docs); - - for v in &vecs { - let len = v.len() as u64; - let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); - - let range = Range { start, end: start + len }; - ranges.push(range); - } - - values.extend(vecs.into_iter().flatten()); - - (ranges, values) -} - unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { let ptr = slice.as_ptr() as *const u8; let len = slice.len() * mem::size_of::(); from_raw_parts(ptr, len) } + +#[cfg(test)] +mod tests { + use super::*; + use std::error::Error; + + #[test] + fn serialize_deserialize() -> Result<(), Box> { + let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 }; + let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 }; + let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 }; + + let mut builder = DocIndexesBuilder::memory(); + + builder.insert(&[a])?; + builder.insert(&[a, b, c])?; + builder.insert(&[a, c])?; + + let bytes = builder.into_inner()?; + let docs = DocIndexes::from_bytes(bytes)?; + + assert_eq!(docs.get(0).unwrap(), &[a]); + assert_eq!(docs.get(1).unwrap(), &[a, b, c]); + assert_eq!(docs.get(2).unwrap(), &[a, c]); + + Ok(()) + } +} diff --git a/src/data/mod.rs b/src/data/mod.rs index bf810de93..b4694493b 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use fst::raw::MmapReadOnly; pub use self::doc_ids::DocIds; -pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder, RawDocIndexesBuilder}; +pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; #[derive(Clone)] enum Data { diff --git a/src/index/mod.rs b/src/index/mod.rs index ea4d13294..39372a53d 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -120,87 +120,3 @@ impl Index { Ok(documents) } } - -#[cfg(test)] -mod tests { - use tempfile::NamedTempFile; - - use super::*; - use crate::index::schema::{Schema, SchemaBuilder, STORED, INDEXED}; - use crate::index::update::{PositiveUpdateBuilder, NegativeUpdateBuilder}; - - #[test] - fn generate_negative_update() -> Result<(), Box> { - let path = NamedTempFile::new()?.into_temp_path(); - let mut builder = NegativeUpdateBuilder::new(&path); - - // you can insert documents in any order, - // it is sorted internally - builder.remove(1); - builder.remove(5); - builder.remove(2); - - let update = builder.build()?; - - assert_eq!(update.info().sign, Sign::Negative); - - Ok(()) - } - - #[test] - fn generate_positive_update() -> Result<(), Box> { - let title; - let description; - let schema = { - let mut builder = SchemaBuilder::new(); - title = builder.new_attribute("title", STORED | INDEXED); - description = builder.new_attribute("description", STORED | INDEXED); - builder.build() - }; - - let sst_path = NamedTempFile::new()?.into_temp_path(); - let tokenizer_builder = DefaultBuilder::new(); - let mut builder = PositiveUpdateBuilder::new(&sst_path, schema.clone(), tokenizer_builder); - - // you can insert documents in any order, - // it is sorted internally - builder.update_field(1, title, "hallo!".to_owned()); - builder.update_field(5, title, "hello!".to_owned()); - builder.update_field(2, title, "hi!".to_owned()); - - builder.remove_field(4, description); - - let update = builder.build()?; - - assert_eq!(update.info().sign, Sign::Positive); - - Ok(()) - } - - #[test] - fn execution() -> Result<(), Box> { - - let index = Index::open("/meili/data")?; - let update = Update::open("update-0001.sst")?; - index.ingest_update(update)?; - // directly apply changes to the database and see new results - let results = index.search("helo"); - - ////////////// - - // let index = Index::open("/meili/data")?; - // let update = Update::open("update-0001.sst")?; - - // // if you create a snapshot before an update - // let snapshot = index.snapshot(); - // index.ingest_update(update)?; - - // // the snapshot does not see the updates - // let results = snapshot.search("helo"); - - // // the raw index itself see new results - // let results = index.search("helo"); - - Ok(()) - } -}