From b3249d515dce8d06eb25829c6e397b4ea82a41a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 15 Nov 2018 17:55:20 +0100
Subject: [PATCH] feat: Introduce an Index system based on RocksDB

---
 Cargo.toml                          |  12 +-
 examples/csv-indexer.rs             |   6 +-
 examples/json-lines-indexer.rs      |   6 +-
 src/blob/negative_blob.rs           |   2 +-
 src/data/doc_ids.rs                 |   6 +-
 src/index.rs                        |  40 ------
 src/index/blob_name.rs              |  16 +++
 src/index/mod.rs                    | 175 ++++++++++++++++++++++
 src/index/schema.rs                 |  82 +++++++++++++
 src/index/search.rs                 |   5 +
 src/index/update/mod.rs             |  55 +++++++++
 src/index/update/negative_update.rs |  59 ++++++++++
 src/index/update/positive_update.rs | 124 ++++++++++++++
 src/tokenizer/mod.rs                |  78 ++++++------
 14 files changed, 569 insertions(+), 97 deletions(-)
 delete mode 100644 src/index.rs
 create mode 100644 src/index/blob_name.rs
 create mode 100644 src/index/mod.rs
 create mode 100644 src/index/schema.rs
 create mode 100644 src/index/search.rs
 create mode 100644 src/index/update/mod.rs
 create mode 100644 src/index/update/negative_update.rs
 create mode 100644 src/index/update/positive_update.rs

diff --git a/Cargo.toml b/Cargo.toml
index 40be43633..775a0950b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,10 +6,11 @@ authors = ["Kerollmops "]
 
 [dependencies]
 byteorder = "1.2"
+fnv = "1.0"
+fs2 = "0.4"
 lazy_static = "1.1"
 sdset = "0.2"
-fs2 = "0.4"
-fnv = "1.0"
+unidecode = "0.3"
 
 [dependencies.fst]
 git = "https://github.com/Kerollmops/fst.git"
@@ -27,12 +28,11 @@ git = "https://github.com/pingcap/rust-rocksdb.git"
 git = "https://github.com/Kerollmops/group-by.git"
 
 [dev-dependencies]
+csv = "1.0"
+elapsed = "0.1"
 moby-name-gen = "0.1"
+serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
 structopt = "0.2"
-unidecode = "0.3"
-elapsed = "0.1"
-serde = "1.0"
 warp = "0.1"
-csv = "1.0"
diff --git a/examples/csv-indexer.rs b/examples/csv-indexer.rs
index c9dc83c8a..b231b7932 100644
--- a/examples/csv-indexer.rs
+++ b/examples/csv-indexer.rs
@@ -79,8 +79,7 @@ impl CsvIndexer {
         }
 
         {
-            let title = Tokenizer::new(&product.title);
-            let title = title.iter().filter(|&(_, w)| !self.common_words.contains(w));
+            let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w));
             insert_document_words(&mut builder, product.id, 1, title);
 
             let key = format!("{}-title", product.id);
@@ -89,8 +88,7 @@ impl CsvIndexer {
         }
 
         {
-            let description = Tokenizer::new(&product.description);
-            let description = description.iter().filter(|&(_, w)| !self.common_words.contains(w));
+            let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w));
             insert_document_words(&mut builder, product.id, 2, description);
 
             let key = format!("{}-description", product.id);
diff --git a/examples/json-lines-indexer.rs b/examples/json-lines-indexer.rs
index 27314d2af..093c8189b 100644
--- a/examples/json-lines-indexer.rs
+++ b/examples/json-lines-indexer.rs
@@ -84,8 +84,7 @@ impl JsonLinesIndexer {
         }
 
         {
-            let title = Tokenizer::new(&product.title);
-            let title = title.iter().filter(|&(_, w)| !self.common_words.contains(w));
+            let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w));
             insert_document_words(&mut builder, product.id, 1, title);
 
             let key = format!("{}-title", product.id);
@@ -94,8 +93,7 @@ impl JsonLinesIndexer {
         }
 
         {
-            let description = Tokenizer::new(&product.description);
-            let description = description.iter().filter(|&(_, w)| !self.common_words.contains(w));
+            let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w));
             insert_document_words(&mut builder, product.id, 2, description);
 
             let key = format!("{}-description", product.id);
diff --git a/src/blob/negative_blob.rs b/src/blob/negative_blob.rs
index ca8679c81..0226b5ae5 100644
--- a/src/blob/negative_blob.rs
+++ b/src/blob/negative_blob.rs
@@ -40,7 +40,7 @@ impl NegativeBlobBuilder {
         Self { doc_ids: DocIdsBuilder::new(wrt) }
     }
 
-    pub fn insert(&mut self, doc: DocumentId) {
+    pub fn insert(&mut self, doc: DocumentId) -> bool {
         self.doc_ids.insert(doc)
     }
 
diff --git a/src/data/doc_ids.rs b/src/data/doc_ids.rs
index c2c3738a4..00ae2f024 100644
--- a/src/data/doc_ids.rs
+++ b/src/data/doc_ids.rs
@@ -47,7 +47,7 @@ impl DocIds {
 }
 
 pub struct DocIdsBuilder<W> {
-    doc_ids: BTreeSet<DocumentId>,
+    doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
     wrt: W,
 }
 
@@ -59,8 +59,8 @@ impl DocIdsBuilder {
         }
     }
 
-    pub fn insert(&mut self, doc: DocumentId) {
-        self.doc_ids.insert(doc);
+    pub fn insert(&mut self, doc: DocumentId) -> bool {
+        self.doc_ids.insert(doc)
     }
 
     pub fn into_inner(mut self) -> io::Result<W> {
diff --git a/src/index.rs b/src/index.rs
deleted file mode 100644
index 41e0ada03..000000000
--- a/src/index.rs
+++ /dev/null
@@ -1,40 +0,0 @@
-use std::path::{Path, PathBuf};
-use std::error::Error;
-use std::fs::{self, File};
-
-use fs2::FileExt;
-
-use crate::rank::Document;
-use crate::blob::Blob;
-
-pub struct Index {
-    path: PathBuf,
-    lock_file: File,
-    blobs: Vec<Blob>,
-}
-
-impl Index {
-    pub fn open<P: Into<PathBuf>>(path: P) -> Result<Index, Box<Error>> {
-        let path = path.into();
-
-        let lock_file = File::create(path.join(".lock"))?;
-        lock_file.try_lock_exclusive()?;
-
-        let blobs = Vec::new();
-
-        Ok(Self { path, lock_file, blobs })
-    }
-
-    pub fn create<P: Into<PathBuf>>(path: P) -> Result<Index, Box<Error>> {
-        let path = path.into();
-
-        fs::create_dir_all(&path)?;
-        File::create(path.join(".lock"))?;
-
-        Self::open(path)
-    }
-
-    pub fn blobs(&self) -> &[Blob] {
-        &self.blobs
-    }
-}
diff --git a/src/index/blob_name.rs b/src/index/blob_name.rs
new file mode 100644
index 000000000..50bebcaa8
--- /dev/null
+++ b/src/index/blob_name.rs
@@ -0,0 +1,16 @@
+use std::fmt;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct BlobName;
+
+impl BlobName {
+    pub fn new() -> BlobName {
+        unimplemented!()
+    }
+}
+
+impl fmt::Display for BlobName {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        unimplemented!()
+    }
+}
diff --git a/src/index/mod.rs b/src/index/mod.rs
new file mode 100644
index 000000000..d7f642de4
--- /dev/null
+++ b/src/index/mod.rs
@@ -0,0 +1,175 @@
+pub mod blob_name;
+pub mod schema;
+pub mod search;
+pub mod update;
+
+use std::io;
+use std::rc::Rc;
+use std::error::Error;
+use std::fs::{self, File};
+use std::fmt::{self, Write};
+use std::ops::{Deref, BitOr};
+use std::path::{Path, PathBuf};
+use std::collections::{BTreeSet, BTreeMap};
+
+use fs2::FileExt;
+use ::rocksdb::{rocksdb, rocksdb_options};
+use ::rocksdb::merge_operator::MergeOperands;
+
+use crate::rank::Document;
+use crate::data::DocIdsBuilder;
+use crate::{DocIndex, DocumentId};
+use crate::index::{update::Update, search::Search};
+use crate::blob::{PositiveBlobBuilder, Blob, Sign};
+use crate::tokenizer::{TokenizerBuilder, DefaultBuilder, Tokenizer};
+
+fn simple_vec_append(key: &[u8], value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
+    let mut output = Vec::new();
+    for bytes in operands.chain(value) {
+        output.extend_from_slice(bytes);
+    }
+    output
+}
+
+pub struct Index {
+    database: rocksdb::DB,
+}
+
+impl Index {
+    pub fn open<P: AsRef<Path>>(path: P) -> Result<Index, Box<Error>> {
+        let path = path.as_ref().to_string_lossy();
+
+        let mut opts = rocksdb_options::DBOptions::new();
+        opts.create_if_missing(true);
+
+        let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
+        cf_opts.add_merge_operator("blobs order operator", simple_vec_append);
+
+        let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
+
+        // check if index is a valid RocksDB and
+        // contains the right key-values (i.e. "blobs-order")
+
+        Ok(Self { database })
+    }
+
+    pub fn ingest_update(&self, update: Update) -> Result<(), Box<Error>> {
+        let path = update.into_path_buf();
+        let path = path.to_string_lossy();
+
+        let mut options = rocksdb_options::IngestExternalFileOptions::new();
+        // options.move_files(true);
+
+        let cf_handle = self.database.cf_handle("default").unwrap();
+        self.database.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
+
+        Ok(())
+    }
+
+    pub fn snapshot(&self) -> Snapshot<&rocksdb::DB> {
+        Snapshot::new(&self.database)
+    }
+}
+
+impl Search for Index {
+    fn search(&self, text: &str) -> Vec<Document> {
+        unimplemented!()
+    }
+}
+
+pub struct Snapshot<D>
+where D: Deref<Target=rocksdb::DB>,
+{
+    inner: rocksdb::Snapshot<D>,
+}
+
+impl<D> Snapshot<D>
+where D: Deref<Target=rocksdb::DB>,
+{
+    pub fn new(inner: D) -> Snapshot<D> {
+        Self { inner: rocksdb::Snapshot::new(inner) }
+    }
+}
+
+impl<D> Search for Snapshot<D>
+where D: Deref<Target=rocksdb::DB>,
+{
+    fn search(&self, text: &str) -> Vec<Document> {
+        unimplemented!()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::index::schema::Schema;
+    use crate::index::update::{PositiveUpdateBuilder, NegativeUpdateBuilder};
+
+    #[test]
+    fn generate_negative_update() -> Result<(), Box<Error>> {
+
+        let schema = Schema::open("/meili/default.sch")?;
+        let mut builder = NegativeUpdateBuilder::new("update-delete-0001.sst");
+
+        // you can insert documents in any order, it is sorted internally
+        builder.remove(1);
+        builder.remove(5);
+        builder.remove(2);
+
+        let update = builder.build()?;
+
+        assert_eq!(update.info().sign, Sign::Negative);
+
+        Ok(())
+    }
+
+    #[test]
+    fn generate_positive_update() -> Result<(), Box<Error>> {
+
+        let schema = Schema::open("/meili/default.sch")?;
+        let tokenizer_builder = DefaultBuilder::new();
+        let mut builder = PositiveUpdateBuilder::new("update-positive-0001.sst", schema.clone(), tokenizer_builder);
+
+        // you can insert documents in any order, it is sorted internally
+        let title_field = schema.field("title").unwrap();
+        builder.update_field(1, title_field, "hallo!".to_owned());
+        builder.update_field(5, title_field, "hello!".to_owned());
+        builder.update_field(2, title_field, "hi!".to_owned());
+
+        let name_field = schema.field("name").unwrap();
+        builder.remove_field(4, name_field);
+
+        let update = builder.build()?;
+
+        assert_eq!(update.info().sign, Sign::Positive);
+
+        Ok(())
+    }
+
+    #[test]
+    fn execution() -> Result<(), Box<Error>> {
+
+        let index = Index::open("/meili/data")?;
+        let update = Update::open("update-0001.sst")?;
+        index.ingest_update(update)?;
+        // directly apply changes to the database and see new results
+        let results = index.search("helo");
+
+        //////////////
+
+        let index = Index::open("/meili/data")?;
+        let update = Update::open("update-0001.sst")?;
+
+        // if you create a snapshot before an update
+        let snapshot = index.snapshot();
+        index.ingest_update(update)?;
+
+        // the snapshot does not see the updates
+        let results = snapshot.search("helo");
+
+        // the raw index itself sees new results
+        let results = index.search("helo");
+
+        Ok(())
+    }
+}
diff --git a/src/index/schema.rs b/src/index/schema.rs
new file mode 100644
index 000000000..e87459e4d
--- /dev/null
+++ b/src/index/schema.rs
@@ -0,0 +1,82 @@
+use std::error::Error;
+use std::path::Path;
+use std::ops::BitOr;
+use std::fmt;
+
+pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
+pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
+
+#[derive(Copy, Clone)]
+pub struct SchemaProps {
+    stored: bool,
+    indexed: bool,
+}
+
+impl SchemaProps {
+    pub fn is_stored(&self) -> bool {
+        self.stored
+    }
+
+    pub fn is_indexed(&self) -> bool {
+        self.indexed
+    }
+}
+
+impl BitOr for SchemaProps {
+    type Output = Self;
+
+    fn bitor(self, other: Self) -> Self::Output {
+        SchemaProps {
+            stored: self.stored | other.stored,
+            indexed: self.indexed | other.indexed,
+        }
+    }
+}
+
+pub struct SchemaBuilder;
+
+impl SchemaBuilder {
+    pub fn new() -> SchemaBuilder {
+        unimplemented!()
+    }
+
+    pub fn field(&mut self, name: &str, props: SchemaProps) -> SchemaField {
+        unimplemented!()
+    }
+
+    pub fn build(self) -> Schema {
+        unimplemented!()
+    }
+}
+
+#[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
+pub struct SchemaField(u32);
+
+impl SchemaField {
+    pub fn as_u32(&self) -> u32 {
+        self.0
+    }
+}
+
+impl fmt::Display for SchemaField {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+#[derive(Clone)]
+pub struct Schema;
+
+impl Schema {
+    pub fn open<P: AsRef<Path>>(path: P) -> Result<Schema, Box<Error>> {
+        unimplemented!()
+    }
+
+    pub fn props(&self, field: SchemaField) -> SchemaProps {
+        unimplemented!()
+    }
+
+    pub fn field(&self, name: &str) -> Option<SchemaField> {
+        unimplemented!()
+    }
+}
diff --git a/src/index/search.rs b/src/index/search.rs
new file mode 100644
index 000000000..712dd1c74
--- /dev/null
+++ b/src/index/search.rs
@@ -0,0 +1,5 @@
+use crate::rank::Document;
+
+pub trait Search {
+    fn search(&self, text: &str) -> Vec<Document>;
+}
diff --git a/src/index/update/mod.rs b/src/index/update/mod.rs
new file mode 100644
index 000000000..cc933ddbc
--- /dev/null
+++ b/src/index/update/mod.rs
@@ -0,0 +1,55 @@
+use std::path::PathBuf;
+use std::error::Error;
+
+use ::rocksdb::rocksdb_options;
+
+use crate::index::blob_name::BlobName;
+use crate::blob::Sign;
+
+mod negative_update;
+mod positive_update;
+
+pub use self::negative_update::{NegativeUpdateBuilder};
+pub use self::positive_update::{PositiveUpdateBuilder, NewState};
+
+// These prefixes are here to make sure the document fields
+// and the internal data don't collide and the internal data are
+// at the top of the sst file.
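+//
+// Illustrative ordering, based on the key formats used below (blob name shown as a placeholder):
+// "00-blobs-order" < "0b-<blob-name>-doc-ids" / "0b-<blob-name>-fst" < "5d-<document-id>-<field>"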
+const FIELD_BLOBS_ORDER: &str = "00-blobs-order";
+
+pub struct Update {
+    path: PathBuf,
+}
+
+impl Update {
+    pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
+        let path = path.into();
+
+        let env_options = rocksdb_options::EnvOptions::new();
+        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
+        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
+        file_writer.open(&path.to_string_lossy())?;
+        let infos = file_writer.finish()?;
+
+        if infos.smallest_key() != FIELD_BLOBS_ORDER.as_bytes() {
+            // FIXME return a nice error
+            panic!("Invalid update file: the blobs-order field is not the smallest key")
+        }
+
+        Ok(Update { path })
+    }
+
+    pub fn into_path_buf(self) -> PathBuf {
+        self.path
+    }
+
+    pub fn info(&self) -> UpdateInfo {
+        unimplemented!()
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct UpdateInfo {
+    pub sign: Sign,
+    pub id: BlobName,
+}
diff --git a/src/index/update/negative_update.rs b/src/index/update/negative_update.rs
new file mode 100644
index 000000000..e90595a5c
--- /dev/null
+++ b/src/index/update/negative_update.rs
@@ -0,0 +1,59 @@
+use std::path::PathBuf;
+use std::error::Error;
+
+use ::rocksdb::rocksdb_options;
+
+use crate::index::update::{FIELD_BLOBS_ORDER, Update};
+use crate::index::blob_name::BlobName;
+use crate::data::DocIdsBuilder;
+use crate::DocumentId;
+
+pub struct NegativeUpdateBuilder {
+    path: PathBuf,
+    doc_ids: DocIdsBuilder<Vec<u8>>,
+}
+
+impl NegativeUpdateBuilder {
+    pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
+        NegativeUpdateBuilder {
+            path: path.into(),
+            doc_ids: DocIdsBuilder::new(Vec::new()),
+        }
+    }
+
+    pub fn remove(&mut self, id: DocumentId) -> bool {
+        self.doc_ids.insert(id)
+    }
+
+    pub fn build(self) -> Result<Update, Box<Error>> {
+        let blob_name = BlobName::new();
+
+        let env_options = rocksdb_options::EnvOptions::new();
+        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
+        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
+
+        file_writer.open(&self.path.to_string_lossy())?;
+
+        // TODO the blob-name must be written in bytes (16 bytes)
+        //      along with the sign
+        unimplemented!("write the blob sign and name");
+
+        // write the blob name to be merged
+        let blob_name = blob_name.to_string();
+        file_writer.merge(FIELD_BLOBS_ORDER.as_bytes(), blob_name.as_bytes())?;
+
+        // write the doc ids
+        let blob_key = format!("0b-{}-doc-ids", blob_name);
+        let blob_doc_ids = self.doc_ids.into_inner()?;
+        file_writer.put(blob_key.as_bytes(), &blob_doc_ids)?;
+
+        for id in blob_doc_ids {
+            let start = format!("5d-{}", id);
+            let end = format!("5d-{}", id + 1);
+            file_writer.delete_range(start.as_bytes(), end.as_bytes())?;
+        }
+
+        file_writer.finish()?;
+        Update::open(self.path)
+    }
+}
diff --git a/src/index/update/positive_update.rs b/src/index/update/positive_update.rs
new file mode 100644
index 000000000..77d24b8a7
--- /dev/null
+++ b/src/index/update/positive_update.rs
@@ -0,0 +1,124 @@
+use std::collections::BTreeMap;
+use std::path::PathBuf;
+use std::error::Error;
+use std::fmt::Write;
+
+use ::rocksdb::rocksdb_options;
+
+use crate::index::schema::{SchemaProps, Schema, SchemaField};
+use crate::index::update::{FIELD_BLOBS_ORDER, Update};
+use crate::tokenizer::TokenizerBuilder;
+use crate::index::blob_name::BlobName;
+use crate::blob::PositiveBlobBuilder;
+use crate::{DocIndex, DocumentId};
+
+pub enum NewState {
+    Updated {
+        value: String,
+        props: SchemaProps,
+    },
+    Removed,
+}
+
+pub struct PositiveUpdateBuilder<B> {
+    path: PathBuf,
+    schema: Schema,
+    tokenizer_builder: B,
+    new_states: BTreeMap<(DocumentId, SchemaField), NewState>,
+}
+
+impl<B> PositiveUpdateBuilder<B> {
+    pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
+        PositiveUpdateBuilder {
+            path: path.into(),
+            schema: schema,
+            tokenizer_builder: tokenizer_builder,
+            new_states: BTreeMap::new(),
+        }
+    }
+
+    // TODO value must be a field that can be indexed
+    pub fn update_field(&mut self, id: DocumentId, field: SchemaField, value: String) {
+        let state = NewState::Updated { value, props: self.schema.props(field) };
+        self.new_states.insert((id, field), state);
+    }
+
+    pub fn remove_field(&mut self, id: DocumentId, field: SchemaField) {
+        self.new_states.insert((id, field), NewState::Removed);
+    }
+}
+
+impl<B> PositiveUpdateBuilder<B>
+where B: TokenizerBuilder
+{
+    pub fn build(self) -> Result<Update, Box<Error>> {
+        let blob_name = BlobName::new();
+
+        let env_options = rocksdb_options::EnvOptions::new();
+        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
+        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
+
+        file_writer.open(&self.path.to_string_lossy())?;
+
+        // TODO the blob-name must be written in bytes (16 bytes)
+        //      along with the sign
+        unimplemented!("write the blob sign and name");
+
+        // write the blob name to be merged
+        let blob_name = blob_name.to_string();
+        file_writer.put(FIELD_BLOBS_ORDER.as_bytes(), blob_name.as_bytes())?;
+
+        let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new());
+        for ((document_id, field), state) in &self.new_states {
+            let value = match state {
+                NewState::Updated { value, props } if props.is_indexed() => value,
+                _ => continue,
+            };
+
+            for (index, word) in self.tokenizer_builder.build(value) {
+                let doc_index = DocIndex {
+                    document_id: *document_id,
+                    attribute: field.as_u32() as u8,
+                    attribute_index: index as u32,
+                };
+                // insert the exact representation
+                let word_lower = word.to_lowercase();
+
+                // and the unidecoded lowercased version
+                let word_unidecoded = unidecode::unidecode(word).to_lowercase();
+                if word_lower != word_unidecoded {
+                    builder.insert(word_unidecoded, doc_index);
+                }
+
+                builder.insert(word_lower, doc_index);
+            }
+        }
+        let (blob_fst_map, blob_doc_idx) = builder.into_inner()?;
+
+        // write the fst
+        let blob_key = format!("0b-{}-fst", blob_name);
+        file_writer.put(blob_key.as_bytes(), &blob_fst_map)?;
+
+        // write the doc-idx
+        let blob_key = format!("0b-{}-doc-idx", blob_name);
+        file_writer.put(blob_key.as_bytes(), &blob_doc_idx)?;
+
+        // write all the documents fields updates
+        let mut key = String::from("5d-");
+        let prefix_len = key.len();
+
+        for ((id, field), state) in self.new_states {
+            key.truncate(prefix_len);
+            write!(&mut key, "{}-{}", id, field)?;
+            match state {
+                NewState::Updated { value, props } => if props.is_stored() {
+                    file_writer.put(key.as_bytes(), value.as_bytes())?
+                },
+                NewState::Removed => file_writer.delete(key.as_bytes())?,
+            }
+        }
+
+        file_writer.finish()?;
+        Update::open(self.path)
+    }
+}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 1fa8051e5..9b075786b 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1,28 +1,32 @@
 use std::mem;
 use self::Separator::*;
 
+pub trait TokenizerBuilder {
+    fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
+}
+
+pub struct DefaultBuilder;
+
+impl DefaultBuilder {
+    pub fn new() -> DefaultBuilder {
+        DefaultBuilder
+    }
+}
+
+impl TokenizerBuilder for DefaultBuilder {
+    fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> {
+        Box::new(Tokenizer::new(text))
+    }
+}
+
 pub struct Tokenizer<'a> {
+    index: usize,
     inner: &'a str,
 }
 
 impl<'a> Tokenizer<'a> {
     pub fn new(string: &str) -> Tokenizer {
-        Tokenizer { inner: string }
-    }
-
-    pub fn iter(&self) -> Tokens {
-        Tokens::new(self.inner)
-    }
-}
-
-pub struct Tokens<'a> {
-    index: usize,
-    inner: &'a str,
-}
-
-impl<'a> Tokens<'a> {
-    fn new(string: &str) -> Tokens {
-        Tokens {
+        Tokenizer {
             index: 0,
             inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]),
         }
@@ -52,7 +56,7 @@ impl Separator {
     }
 }
 
-impl<'a> Iterator for Tokens<'a> {
+impl<'a> Iterator for Tokenizer<'a> {
     type Item = (usize, &'a str);
 
     fn next(&mut self) -> Option<Self::Item> {
@@ -101,37 +105,33 @@ mod tests {
 
     #[test]
     fn easy() {
-        let tokenizer = Tokenizer::new("salut");
-        let mut tokens = tokenizer.iter();
+        let mut tokenizer = Tokenizer::new("salut");
 
-        assert_eq!(tokens.next(), Some((0, "salut")));
-        assert_eq!(tokens.next(), None);
+        assert_eq!(tokenizer.next(), Some((0, "salut")));
+        assert_eq!(tokenizer.next(), None);
 
-        let tokenizer = Tokenizer::new("yo ");
-        let mut tokens = tokenizer.iter();
+        let mut tokenizer = Tokenizer::new("yo ");
 
-        assert_eq!(tokens.next(), Some((0, "yo")));
-        assert_eq!(tokens.next(), None);
+        assert_eq!(tokenizer.next(), Some((0, "yo")));
+        assert_eq!(tokenizer.next(), None);
     }
 
     #[test]
     fn hard() {
-        let tokenizer = Tokenizer::new(" .? yo lolo. aïe");
-        let mut tokens = tokenizer.iter();
+        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
 
-        assert_eq!(tokens.next(), Some((0, "yo")));
-        assert_eq!(tokens.next(), Some((1, "lolo")));
-        assert_eq!(tokens.next(), Some((9, "aïe")));
-        assert_eq!(tokens.next(), None);
+        assert_eq!(tokenizer.next(), Some((0, "yo")));
+        assert_eq!(tokenizer.next(), Some((1, "lolo")));
+        assert_eq!(tokenizer.next(), Some((9, "aïe")));
+        assert_eq!(tokenizer.next(), None);
 
-        let tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
-        let mut tokens = tokenizer.iter();
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
 
-        assert_eq!(tokens.next(), Some((0, "yo")));
-        assert_eq!(tokens.next(), Some((8, "lolo")));
-        assert_eq!(tokens.next(), Some((16, "wtf")));
-        assert_eq!(tokens.next(), Some((24, "lol")));
-        assert_eq!(tokens.next(), Some((32, "aïe")));
-        assert_eq!(tokens.next(), None);
+        assert_eq!(tokenizer.next(), Some((0, "yo")));
+        assert_eq!(tokenizer.next(), Some((8, "lolo")));
+        assert_eq!(tokenizer.next(), Some((16, "wtf")));
+        assert_eq!(tokenizer.next(), Some((24, "lol")));
+        assert_eq!(tokenizer.next(), Some((32, "aïe")));
+        assert_eq!(tokenizer.next(), None);
     }
 }