From 3dc057ca9cb679bd5354c3ebf9248fb330ce5e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Apr 2019 18:27:57 +0200 Subject: [PATCH] feat: Introduce the new Index system --- meilidb-core/Cargo.toml | 1 + meilidb-core/src/index.rs | 227 ++++++++++++------------------ meilidb-core/src/lib.rs | 8 +- meilidb-core/src/query_builder.rs | 32 +++-- meilidb-data/Cargo.toml | 7 +- meilidb-data/src/database.rs | 61 ++++---- meilidb-data/src/index_event.rs | 16 +-- meilidb-data/src/indexer.rs | 85 +++++++++-- 8 files changed, 235 insertions(+), 202 deletions(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 16bc204d4..88363cbad 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -14,6 +14,7 @@ rayon = "1.0.3" sdset = "0.3.1" serde = { version = "1.0.88", features = ["derive"] } slice-group-by = "0.2.4" +zerocopy = "0.2.2" [dependencies.fst] git = "https://github.com/Kerollmops/fst.git" diff --git a/meilidb-core/src/index.rs b/meilidb-core/src/index.rs index 44a76d359..87b59e682 100644 --- a/meilidb-core/src/index.rs +++ b/meilidb-core/src/index.rs @@ -1,175 +1,134 @@ -use std::error::Error; - -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use fst::{map, Map, IntoStreamer, Streamer}; -use fst::raw::Fst; +use std::collections::BTreeMap; +use fst::{set, IntoStreamer, Streamer}; +use sdset::{Set, SetBuf, SetOperation}; use sdset::duo::{Union, DifferenceByKey}; -use sdset::{Set, SetOperation}; +use crate::{DocIndex, DocumentId}; -use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use crate::write_to_bytes::WriteToBytes; -use crate::data::{DocIndexes, DocIndexesBuilder}; -use crate::{DocumentId, DocIndex}; +pub type Word = Vec; // TODO should be a smallvec -#[derive(Default)] -pub struct Index { - pub map: Map, - pub indexes: DocIndexes, +pub trait Store: Clone { + type Error: std::error::Error; + + fn get_fst(&self) -> Result; + fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error>; + + fn get_indexes(&self, word: &[u8]) -> Result>, Self::Error>; + fn set_indexes(&self, word: &[u8], indexes: &Set) -> Result<(), Self::Error>; + fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error>; } -impl Index { - pub fn remove_documents(&self, documents: &Set) -> Index { +pub struct Index { + pub set: fst::Set, + pub store: S, +} + +impl Index +where S: Store, +{ + pub fn from_store(store: S) -> Result, S::Error> { + let set = store.get_fst()?; + Ok(Index { set, store }) + } + + pub fn remove_documents(&self, documents: &Set) -> Result, S::Error> { let mut buffer = Vec::new(); - let mut builder = IndexBuilder::new(); + let mut builder = fst::SetBuilder::memory(); let mut stream = self.into_stream(); - while let Some((key, indexes)) = stream.next() { - buffer.clear(); + while let Some((input, result)) = stream.next() { + let indexes = match result? { + Some(indexes) => indexes, + None => continue, + }; - let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x); + let op = DifferenceByKey::new(&indexes, documents, |x| x.document_id, |x| *x); + buffer.clear(); op.extend_vec(&mut buffer); - if !buffer.is_empty() { + if buffer.is_empty() { + self.store.del_indexes(input)?; + } else { + builder.insert(input).unwrap(); let indexes = Set::new_unchecked(&buffer); - builder.insert(key, indexes).unwrap(); + self.store.set_indexes(input, indexes)?; } } - builder.build() + let set = builder.into_inner().and_then(fst::Set::from_bytes).unwrap(); + self.store.set_fst(&set)?; + + Ok(Index { set, store: self.store.clone() }) } - pub fn union(&self, other: &Index) -> Index { - let mut builder = IndexBuilder::new(); - let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union(); - + pub fn insert_indexes(&self, map: BTreeMap>) -> Result, S::Error> { let mut buffer = Vec::new(); - while let Some((key, ivalues)) = stream.next() { - buffer.clear(); - match ivalues { - [a, b] => { - let indexes = if a.index == 0 { &self.indexes } else { &other.indexes }; - let indexes = &indexes[a.value as usize]; - let a = Set::new_unchecked(indexes); + let mut builder = fst::SetBuilder::memory(); + let set = fst::Set::from_iter(map.keys()).unwrap(); + let mut union_ = self.set.op().add(&set).r#union(); - let indexes = if b.index == 0 { &self.indexes } else { &other.indexes }; - let indexes = &indexes[b.value as usize]; - let b = Set::new_unchecked(indexes); + while let Some(input) = union_.next() { + let remote = self.store.get_indexes(input)?; + let locale = map.get(input); - let op = Union::new(a, b); - op.extend_vec(&mut buffer); + match (remote, locale) { + (Some(remote), Some(locale)) => { + buffer.clear(); + Union::new(&remote, &locale).extend_vec(&mut buffer); + let indexes = Set::new_unchecked(&buffer); + + if !indexes.is_empty() { + self.store.set_indexes(input, indexes)?; + builder.insert(input).unwrap(); + } else { + self.store.del_indexes(input)?; + } }, - [x] => { - let indexes = if x.index == 0 { &self.indexes } else { &other.indexes }; - let indexes = &indexes[x.value as usize]; - buffer.extend_from_slice(indexes) + (None, Some(locale)) => { + self.store.set_indexes(input, &locale)?; + builder.insert(input).unwrap(); }, - _ => continue, - } - - if !buffer.is_empty() { - let indexes = Set::new_unchecked(&buffer); - builder.insert(key, indexes).unwrap(); + (Some(_), None) => { + builder.insert(input).unwrap(); + }, + (None, None) => unreachable!(), } } - builder.build() + let set = builder.into_inner().and_then(fst::Set::from_bytes).unwrap(); + self.store.set_fst(&set)?; + + Ok(Index { set, store: self.store.clone() }) } } -impl FromSharedDataCursor for Index { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { - let len = cursor.read_u64::()? as usize; - let data = cursor.extract(len); - - let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?; - let map = Map::from(fst); - - let indexes = DocIndexes::from_shared_data_cursor(cursor)?; - - Ok(Index { map, indexes}) - } +pub struct Stream<'m, S> { + set_stream: set::Stream<'m>, + store: &'m S, } -impl WriteToBytes for Index { - fn write_to_bytes(&self, bytes: &mut Vec) { - let slice = self.map.as_fst().as_bytes(); - let len = slice.len() as u64; - let _ = bytes.write_u64::(len); - bytes.extend_from_slice(slice); - - self.indexes.write_to_bytes(bytes); - } -} - -impl<'m, 'a> IntoStreamer<'a> for &'m Index { - type Item = (&'a [u8], &'a Set); - type Into = Stream<'m>; - - fn into_stream(self) -> Self::Into { - Stream { - map_stream: self.map.into_stream(), - indexes: &self.indexes, - } - } -} - -pub struct Stream<'m> { - map_stream: map::Stream<'m>, - indexes: &'m DocIndexes, -} - -impl<'m, 'a> Streamer<'a> for Stream<'m> { - type Item = (&'a [u8], &'a Set); +impl<'m, 'a, S> Streamer<'a> for Stream<'m, S> +where S: 'a + Store, +{ + type Item = (&'a [u8], Result>, S::Error>); fn next(&'a mut self) -> Option { - match self.map_stream.next() { - Some((input, index)) => { - let indexes = &self.indexes[index as usize]; - let indexes = Set::new_unchecked(indexes); - Some((input, indexes)) - }, + match self.set_stream.next() { + Some(input) => Some((input, self.store.get_indexes(input))), None => None, } } } -pub struct IndexBuilder { - map: fst::MapBuilder>, - indexes: DocIndexesBuilder>, - value: u64, -} +impl<'m, 'a, S> IntoStreamer<'a> for &'m Index +where S: 'a + Store, +{ + type Item = (&'a [u8], Result>, S::Error>); + type Into = Stream<'m, S>; -impl IndexBuilder { - pub fn new() -> Self { - IndexBuilder { - map: fst::MapBuilder::memory(), - indexes: DocIndexesBuilder::memory(), - value: 0, + fn into_stream(self) -> Self::Into { + Stream { + set_stream: self.set.into_stream(), + store: &self.store, } } - - /// If a key is inserted that is less than or equal to any previous key added, - /// then an error is returned. Similarly, if there was a problem writing - /// to the underlying writer, an error is returned. - // FIXME what if one write doesn't work but the other do ? - pub fn insert(&mut self, key: K, indexes: &Set) -> fst::Result<()> - where K: AsRef<[u8]>, - { - self.map.insert(key, self.value)?; - self.indexes.insert(indexes); - self.value += 1; - Ok(()) - } - - pub fn build(self) -> Index { - let map = self.map.into_inner().unwrap(); - let indexes = self.indexes.into_inner().unwrap(); - - let map = Map::from_bytes(map).unwrap(); - let indexes = DocIndexes::from_bytes(indexes).unwrap(); - - Index { map, indexes } - } } diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 18e9a99cc..5bff103cd 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -13,16 +13,19 @@ use serde::{Serialize, Deserialize}; use slice_group_by::GroupBy; use rayon::slice::ParallelSliceMut; +use zerocopy::{AsBytes, FromBytes}; -pub use self::index::{Index, IndexBuilder}; +pub use self::index::{Index, Store}; pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder}; /// Represent an internally generated document unique identifier. /// /// It is used to inform the database the document you want to deserialize. /// Helpful for custom ranking. -#[derive(Serialize, Deserialize)] #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +#[derive(Serialize, Deserialize)] +#[derive(AsBytes, FromBytes)] +#[repr(C)] pub struct DocumentId(pub u64); /// This structure represent the position of a word @@ -31,6 +34,7 @@ pub struct DocumentId(pub u64); /// This is stored in the map, generated at index time, /// extracted and interpreted at search time. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(AsBytes, FromBytes)] #[repr(C)] pub struct DocIndex { /// The document identifier where the word was found. diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index ad7de9c15..40e172403 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -14,8 +14,8 @@ use log::info; use crate::automaton::{self, DfaExt, AutomatonExt}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::criterion::Criteria; -use crate::{raw_documents_from_matches, RawDocument, Document}; -use crate::{Index, Match, DocumentId}; +use crate::raw_documents_from_matches; +use crate::{Match, DocumentId, Index, Store, RawDocument, Document}; fn generate_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); @@ -82,16 +82,18 @@ impl<'c, I, FI> QueryBuilder<'c, I, FI> } } -impl<'c, I, FI> QueryBuilder<'c, I, FI> -where I: Deref, +impl<'c, I, FI, S> QueryBuilder<'c, I, FI> +where I: Deref>, + S: Store, { fn query_all(&self, query: &str) -> Vec { let automatons = generate_automatons(query); + let fst = self.index.set.as_fst(); let mut stream = { - let mut op_builder = fst::map::OpBuilder::new(); + let mut op_builder = fst::raw::OpBuilder::new(); for automaton in &automatons { - let stream = self.index.map.search(automaton); + let stream = fst.search(automaton); op_builder.push(stream); } op_builder.r#union() @@ -105,10 +107,12 @@ where I: Deref, let distance = automaton.eval(input).to_u8(); let is_exact = distance == 0 && input.len() == automaton.query_len(); - let doc_indexes = &self.index.indexes; - let doc_indexes = &doc_indexes[iv.value as usize]; + // let doc_indexes = &self.index.indexes; + // let doc_indexes = &doc_indexes[iv.value as usize]; - for di in doc_indexes { + let doc_indexes = self.index.store.get_indexes(input).unwrap().unwrap(); + + for di in doc_indexes.as_slice() { if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) { let match_ = Match { query_index: iv.index as u32, @@ -135,9 +139,10 @@ where I: Deref, } } -impl<'c, I, FI> QueryBuilder<'c, I, FI> -where I: Deref, +impl<'c, I, FI, S> QueryBuilder<'c, I, FI> +where I: Deref>, FI: Fn(DocumentId) -> bool, + S: Store, { pub fn query(self, query: &str, range: Range) -> Vec { // We delegate the filter work to the distinct query builder, @@ -212,11 +217,12 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD> } } -impl<'c, I, FI, FD, K> DistinctQueryBuilder<'c, I, FI, FD> -where I: Deref, +impl<'c, I, FI, FD, K, S> DistinctQueryBuilder<'c, I, FI, FD> +where I: Deref>, FI: Fn(DocumentId) -> bool, FD: Fn(DocumentId) -> Option, K: Hash + Eq, + S: Store, { pub fn query(self, query: &str, range: Range) -> Vec { let start = Instant::now(); diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index 6096e4ad3..c883196d4 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" arc-swap = "0.3.11" bincode = "1.1.2" byteorder = "1.3.1" +deunicode = "1.0.0" hashbrown = { version = "0.2.2", features = ["serde"] } linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } meilidb-core = { path = "../meilidb-core", version = "0.1.0" } @@ -18,8 +19,12 @@ serde = { version = "1.0.90", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } sled = "0.23.0" toml = { version = "0.5.0", features = ["preserve_order"] } -deunicode = "1.0.0" +zerocopy = "0.2.2" [dependencies.rmp-serde] git = "https://github.com/3Hren/msgpack-rust.git" rev = "40b3d48" + +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "arc-byte-slice" diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index da43dd4bc..f66d31aa9 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -12,7 +12,7 @@ use meilidb_core::criterion::Criteria; use meilidb_core::QueryBuilder; use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::{DocumentId, Index as WordIndex}; +use meilidb_core::DocumentId; use rmp_serde::decode::{Error as RmpError}; use sdset::SetBuf; use serde::de; @@ -20,7 +20,9 @@ use sled::IVec; use crate::{Schema, SchemaAttr, RankedMap}; use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError}; -use crate::indexer::Indexer; +use crate::indexer::{Indexer, WordIndexTree}; + +pub type WordIndex = meilidb_core::Index; #[derive(Debug)] pub enum Error { @@ -72,6 +74,10 @@ fn index_name(name: &str) -> Vec { format!("index-{}", name).into_bytes() } +fn word_index_name(name: &str) -> Vec { + format!("word-index-{}", name).into_bytes() +} + fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec { let DocumentId(document_id) = id; let SchemaAttr(schema_attr) = attr; @@ -136,7 +142,8 @@ impl Database { let raw_name = index_name(name); if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) { let tree = self.inner.open_tree(raw_name)?; - let raw_index = RawIndex::from_raw(tree)?; + let word_index_tree = self.inner.open_tree(word_index_name(name))?; + let raw_index = RawIndex::from_raw(tree, word_index_tree)?; self.opened.rcu(|opened| { let mut opened = HashMap::clone(opened); @@ -162,7 +169,8 @@ impl Database { None => { let raw_name = index_name(&name); let tree = self.inner.open_tree(raw_name)?; - let raw_index = RawIndex::new_from_raw(tree, schema)?; + let word_index_tree = self.inner.open_tree(word_index_name(&name))?; + let raw_index = RawIndex::new_from_raw(tree, word_index_tree, schema)?; self.opened.rcu(|opened| { let mut opened = HashMap::clone(opened); @@ -185,25 +193,16 @@ pub struct RawIndex { } impl RawIndex { - fn from_raw(inner: Arc) -> Result { + fn from_raw(inner: Arc, word_index: Arc) -> Result { let schema = { let bytes = inner.get("schema")?; let bytes = bytes.ok_or(Error::SchemaMissing)?; Schema::read_from_bin(bytes.as_ref())? }; - let bytes = inner.get("word-index")?; - let bytes = bytes.ok_or(Error::WordIndexMissing)?; - let word_index = { - let len = bytes.len(); - let bytes: Arc<[u8]> = Into::into(bytes); - let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len); - - // TODO must handle this error - let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap(); - - Arc::new(ArcSwap::new(Arc::new(word_index))) - }; + let store = WordIndexTree(word_index); + let word_index = WordIndex::from_store(store)?; + let word_index = Arc::new(ArcSwap::new(Arc::new(word_index))); let ranked_map = { let map = match inner.get("ranked-map")? { @@ -217,13 +216,18 @@ impl RawIndex { Ok(RawIndex { schema, word_index, ranked_map, inner }) } - fn new_from_raw(inner: Arc, schema: Schema) -> Result { + fn new_from_raw( + inner: Arc, + word_index: Arc, + schema: Schema, + ) -> Result + { let mut schema_bytes = Vec::new(); schema.write_to_bin(&mut schema_bytes)?; inner.set("schema", schema_bytes)?; - let word_index = WordIndex::default(); - inner.set("word-index", word_index.into_bytes())?; + let store = WordIndexTree(word_index); + let word_index = WordIndex::from_store(store)?; let word_index = Arc::new(ArcSwap::new(Arc::new(word_index))); let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default()))); @@ -243,12 +247,8 @@ impl RawIndex { self.ranked_map.lease() } - pub fn update_word_index(&self, word_index: Arc) -> sled::Result<()> { - let data = word_index.into_bytes(); - self.inner.set("word-index", data).map(drop)?; - self.word_index.store(word_index); - - Ok(()) + pub fn update_word_index(&self, word_index: Arc) { + self.word_index.store(word_index) } pub fn update_ranked_map(&self, ranked_map: Arc) -> sled::Result<()> { @@ -417,14 +417,15 @@ impl DocumentsAddition { Ok(()) } + pub fn finalize(self) -> sled::Result<()> { let delta_index = self.indexer.build(); let index = self.inner.word_index(); - let new_index = index.r#union(&delta_index); + let new_index = index.insert_indexes(delta_index)?; let new_index = Arc::from(new_index); - self.inner.update_word_index(new_index)?; + self.inner.update_word_index(new_index); Ok(()) } @@ -454,10 +455,10 @@ impl DocumentsDeletion { let idset = SetBuf::new_unchecked(self.documents); let index = self.inner.word_index(); - let new_index = index.remove_documents(&idset); + let new_index = index.remove_documents(&idset)?; let new_index = Arc::from(new_index); - self.inner.update_word_index(new_index)?; + self.inner.update_word_index(new_index); Ok(()) } diff --git a/meilidb-data/src/index_event.rs b/meilidb-data/src/index_event.rs index 40d54cbf3..3a63295c9 100644 --- a/meilidb-data/src/index_event.rs +++ b/meilidb-data/src/index_event.rs @@ -7,12 +7,12 @@ use meilidb_core::data::DocIds; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -enum NewIndexEvent<'a> { +enum NewIndexEvent<'a, S> { RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a WordIndex), + UpdatedDocuments(&'a WordIndex), } -impl<'a> WriteToBytes for NewIndexEvent<'a> { +impl<'a, S> WriteToBytes for NewIndexEvent<'a, S> { fn write_to_bytes(&self, bytes: &mut Vec) { match self { NewIndexEvent::RemovedDocuments(doc_ids) => { @@ -21,24 +21,24 @@ impl<'a> WriteToBytes for NewIndexEvent<'a> { }, NewIndexEvent::UpdatedDocuments(index) => { let _ = bytes.write_u8(1); - index.write_to_bytes(bytes); + // index.write_to_bytes(bytes); } } } } -enum IndexEvent { +enum IndexEvent { RemovedDocuments(DocIds), - UpdatedDocuments(WordIndex), + UpdatedDocuments(WordIndex), } -impl FromSharedDataCursor for IndexEvent { +impl FromSharedDataCursor for IndexEvent { type Error = Box; fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { match cursor.read_u8()? { 0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments), - 1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments), + // 1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments), _ => Err("invalid index event type".into()), } } diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index 3cfd8f722..3405a79de 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -1,14 +1,78 @@ use std::collections::BTreeMap; use std::convert::TryFrom; +use std::sync::Arc; use deunicode::deunicode_with_tofu; -use meilidb_core::{DocumentId, DocIndex}; -use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder}; +use meilidb_core::{DocumentId, DocIndex, Store}; use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; -use sdset::Set; +use sdset::{Set, SetBuf}; +use sled::Tree; +use zerocopy::{AsBytes, LayoutVerified}; use crate::SchemaAttr; +#[derive(Clone)] +pub struct WordIndexTree(pub Arc); + +impl Store for WordIndexTree { + type Error = sled::Error; + + fn get_fst(&self) -> Result { + match self.0.get("fst")? { + Some(bytes) => { + let bytes: Arc<[u8]> = bytes.into(); + let len = bytes.len(); + let raw = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); + Ok(fst::Set::from(raw)) + }, + None => Ok(fst::Set::default()), + } + } + + fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error> { + let bytes = set.as_fst().to_vec(); + self.0.set("fst", bytes)?; + Ok(()) + } + + fn get_indexes(&self, word: &[u8]) -> Result>, Self::Error> { + let mut word_bytes = Vec::from("word-"); + word_bytes.extend_from_slice(word); + + match self.0.get(word_bytes)? { + Some(bytes) => { + let layout = LayoutVerified::new_slice(bytes.as_ref()).unwrap(); + let slice = layout.into_slice(); + let setbuf = SetBuf::new_unchecked(slice.to_vec()); + Ok(Some(setbuf)) + }, + None => Ok(None), + } + } + + fn set_indexes(&self, word: &[u8], indexes: &Set) -> Result<(), Self::Error> { + let mut word_bytes = Vec::from("word-"); + word_bytes.extend_from_slice(word); + + let slice = indexes.as_slice(); + let bytes = slice.as_bytes(); + + self.0.set(word_bytes, bytes)?; + + Ok(()) + } + + fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error> { + let mut word_bytes = Vec::from("word-"); + word_bytes.extend_from_slice(word); + + self.0.del(word_bytes)?; + + Ok(()) + } + +} + type Word = Vec; // TODO make it be a SmallVec pub struct Indexer { @@ -48,18 +112,11 @@ impl Indexer { } } - pub fn build(self) -> WordIndex { - let mut builder = WordIndexBuilder::new(); - - for (key, mut indexes) in self.indexed { + pub fn build(self) -> BTreeMap> { + self.indexed.into_iter().map(|(word, mut indexes)| { indexes.sort_unstable(); - indexes.dedup(); - - let indexes = Set::new_unchecked(&indexes); - builder.insert(key, indexes).unwrap(); - } - - builder.build() + (word, SetBuf::new_unchecked(indexes)) + }).collect() } }