From a76c00a787d706f42c17de01ffea53bc3e4643ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 16:49:31 +0200 Subject: [PATCH] feat: Create types to edit synonyms and keep them in the database --- .../src/database/documents_addition.rs | 3 +- .../src/database/documents_deletion.rs | 3 +- meilidb-data/src/database/index.rs | 30 ++++++- meilidb-data/src/database/main_index.rs | 16 ++++ meilidb-data/src/database/mod.rs | 22 ++++- meilidb-data/src/database/raw_index.rs | 4 +- .../src/database/synonyms_addition.rs | 83 +++++++++++++++++++ .../src/database/synonyms_deletion.rs | 71 ++++++++++++++++ meilidb-data/src/database/synonyms_index.rs | 23 +++++ 9 files changed, 248 insertions(+), 7 deletions(-) create mode 100644 meilidb-data/src/database/synonyms_addition.rs create mode 100644 meilidb-data/src/database/synonyms_deletion.rs create mode 100644 meilidb-data/src/database/synonyms_index.rs diff --git a/meilidb-data/src/database/documents_addition.rs b/meilidb-data/src/database/documents_addition.rs index 177d1975c..15323be70 100644 --- a/meilidb-data/src/database/documents_addition.rs +++ b/meilidb-data/src/database/documents_addition.rs @@ -120,11 +120,12 @@ impl<'a> DocumentsAddition<'a> { // update the "consistent" view of the Index let ranked_map = self.ranked_map; + let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone() let schema = lease_inner.schema.clone(); let raw = lease_inner.raw.clone(); lease_inner.raw.compact(); - let inner = InnerIndex { words, schema, ranked_map, raw }; + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; self.inner.0.store(Arc::new(inner)); Ok(()) diff --git a/meilidb-data/src/database/documents_deletion.rs b/meilidb-data/src/database/documents_deletion.rs index e89923199..9813afe3c 100644 --- a/meilidb-data/src/database/documents_deletion.rs +++ b/meilidb-data/src/database/documents_deletion.rs @@ -119,11 +119,12 @@ impl<'a> DocumentsDeletion<'a> { // update the "consistent" view of the Index let ranked_map = lease_inner.ranked_map.clone(); + let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone() let schema = lease_inner.schema.clone(); let raw = lease_inner.raw.clone(); lease_inner.raw.compact(); - let inner = InnerIndex { words, schema, ranked_map, raw }; + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; self.inner.0.store(Arc::new(inner)); Ok(()) diff --git a/meilidb-data/src/database/index.rs b/meilidb-data/src/database/index.rs index 4cc6d7acb..886d31118 100644 --- a/meilidb-data/src/database/index.rs +++ b/meilidb-data/src/database/index.rs @@ -13,7 +13,11 @@ use crate::ranked_map::RankedMap; use crate::serde::Deserializer; use super::{Error, CustomSettings}; -use super::{RawIndex, DocumentsAddition, DocumentsDeletion}; +use super::{ + RawIndex, + DocumentsAddition, DocumentsDeletion, + SynonymsAddition, SynonymsDeletion, +}; #[derive(Copy, Clone)] pub struct IndexStats { @@ -27,6 +31,7 @@ pub struct Index(pub ArcSwap); pub struct InnerIndex { pub words: fst::Set, + pub synonyms: fst::Set, pub schema: Schema, pub ranked_map: RankedMap, pub raw: RawIndex, // TODO this will be a snapshot in the future @@ -39,6 +44,11 @@ impl Index { None => fst::Set::default(), }; + let synonyms = match raw.main.synonyms_set()? { + Some(synonyms) => synonyms, + None => fst::Set::default(), + }; + let schema = match raw.main.schema()? { Some(schema) => schema, None => return Err(Error::SchemaMissing), @@ -49,7 +59,7 @@ impl Index { None => RankedMap::default(), }; - let inner = InnerIndex { words, schema, ranked_map, raw }; + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; let index = Index(ArcSwap::new(Arc::new(inner))); Ok(index) @@ -101,6 +111,14 @@ impl Index { DocumentsDeletion::new(self, ranked_map) } + pub fn synonyms_addition(&self) -> SynonymsAddition { + SynonymsAddition::new(self) + } + + pub fn synonyms_deletion(&self) -> SynonymsDeletion { + SynonymsDeletion::new(self) + } + pub fn document( &self, fields: Option<&HashSet<&str>>, @@ -141,4 +159,12 @@ impl Store for IndexLease { fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { Ok(self.0.raw.words.doc_indexes(word)?) } + + fn synonyms(&self) -> Result<&fst::Set, Self::Error> { + Ok(&self.0.synonyms) + } + + fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { + Ok(self.0.raw.synonyms.alternatives_to(word)?) + } } diff --git a/meilidb-data/src/database/main_index.rs b/meilidb-data/src/database/main_index.rs index 7b3b98479..d7d4e1fbd 100644 --- a/meilidb-data/src/database/main_index.rs +++ b/meilidb-data/src/database/main_index.rs @@ -44,6 +44,22 @@ impl MainIndex { self.0.set("words", value.as_fst().as_bytes()).map_err(Into::into) } + pub fn synonyms_set(&self) -> Result, Error> { + match self.0.get_pinned("synonyms")? { + Some(bytes) => { + let len = bytes.len(); + let value = Arc::from(bytes.as_ref()); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + None => Ok(None), + } + } + + pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> { + self.0.set("synonyms", value.as_fst().as_bytes()).map_err(Into::into) + } + pub fn ranked_map(&self) -> Result, Error> { match self.0.get_pinned("ranked-map")? { Some(bytes) => { diff --git a/meilidb-data/src/database/mod.rs b/meilidb-data/src/database/mod.rs index b9df6fc0b..2edf774e0 100644 --- a/meilidb-data/src/database/mod.rs +++ b/meilidb-data/src/database/mod.rs @@ -13,6 +13,9 @@ mod error; mod index; mod main_index; mod raw_index; +mod synonyms_addition; +mod synonyms_deletion; +mod synonyms_index; mod words_index; pub use self::error::Error; @@ -22,11 +25,14 @@ pub use self::custom_settings::CustomSettings; use self::docs_words_index::DocsWordsIndex; use self::documents_addition::DocumentsAddition; use self::documents_deletion::DocumentsDeletion; +use self::synonyms_addition::SynonymsAddition; +use self::synonyms_deletion::SynonymsDeletion; use self::documents_index::DocumentsIndex; use self::index::InnerIndex; use self::main_index::MainIndex; use self::raw_index::{RawIndex, InnerRawIndex}; use self::words_index::WordsIndex; +use self::synonyms_index::SynonymsIndex; pub struct Database { cache: RwLock>>, @@ -99,6 +105,12 @@ impl Database { MainIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(name))) }; + let synonyms = { + let cf_name = format!("{}-synonyms", name); + self.inner.cf_handle(&cf_name).expect("cf not found"); + SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name))) + }; + let words = { let cf_name = format!("{}-words", name); self.inner.cf_handle(&cf_name).expect("cf not found"); @@ -123,7 +135,7 @@ impl Database { CustomSettings(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name))) }; - let raw_index = RawIndex { main, words, docs_words, documents, custom }; + let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -154,6 +166,12 @@ impl Database { main.set_schema(&schema)?; + let synonyms = { + let cf_name = format!("{}-synonyms", name); + self.inner.create_cf(&cf_name, &rocksdb::Options::default())?; + SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name))) + }; + let words = { let cf_name = format!("{}-words", name); self.inner.create_cf(&cf_name, &rocksdb::Options::default())?; @@ -182,7 +200,7 @@ impl Database { indexes.insert(name.to_string()); self.set_indexes(&indexes)?; - let raw_index = RawIndex { main, words, docs_words, documents, custom }; + let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() diff --git a/meilidb-data/src/database/raw_index.rs b/meilidb-data/src/database/raw_index.rs index 8c129ac2d..612fb0df1 100644 --- a/meilidb-data/src/database/raw_index.rs +++ b/meilidb-data/src/database/raw_index.rs @@ -1,9 +1,10 @@ use std::sync::Arc; -use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings}; +use super::{MainIndex, SynonymsIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings}; #[derive(Clone)] pub struct RawIndex { pub main: MainIndex, + pub synonyms: SynonymsIndex, pub words: WordsIndex, pub docs_words: DocsWordsIndex, pub documents: DocumentsIndex, @@ -13,6 +14,7 @@ pub struct RawIndex { impl RawIndex { pub(crate) fn compact(&self) { self.main.0.compact_range(None::<&[u8]>, None::<&[u8]>); + self.synonyms.0.compact_range(None::<&[u8]>, None::<&[u8]>); self.words.0.compact_range(None::<&[u8]>, None::<&[u8]>); self.docs_words.0.compact_range(None::<&[u8]>, None::<&[u8]>); self.documents.0.compact_range(None::<&[u8]>, None::<&[u8]>); diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs new file mode 100644 index 000000000..755c11710 --- /dev/null +++ b/meilidb-data/src/database/synonyms_addition.rs @@ -0,0 +1,83 @@ +use std::collections::BTreeMap; +use std::sync::Arc; + +use fst::{SetBuilder, set::OpBuilder}; +use sdset::SetBuf; + +use crate::database::index::InnerIndex; +use super::{Error, Index}; + +pub struct SynonymsAddition<'a> { + inner: &'a Index, + synonyms: BTreeMap>, +} + +impl<'a> SynonymsAddition<'a> { + pub fn new(inner: &'a Index) -> SynonymsAddition<'a> { + SynonymsAddition { inner, synonyms: BTreeMap::new() } + } + + pub fn add_synonym(&mut self, synonym: String, alternatives: I) + where I: Iterator, + { + self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); + } + + pub fn finalize(self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let synonyms = &lease_inner.raw.synonyms; + let main = &lease_inner.raw.main; + + let mut synonyms_builder = SetBuilder::memory(); + + for (synonym, mut alternatives) in self.synonyms { + synonyms_builder.insert(&synonym).unwrap(); + + let alternatives = { + alternatives.iter_mut().for_each(|s| *s = s.to_lowercase()); + let alternatives = SetBuf::from_dirty(alternatives); + + let mut alternatives_builder = SetBuilder::memory(); + alternatives_builder.extend_iter(alternatives).unwrap(); + alternatives_builder.into_inner().unwrap() + }; + synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?; + } + + let delta_synonyms = synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let synonyms = match main.synonyms_set()? { + Some(synonyms) => { + let op = OpBuilder::new() + .add(synonyms.stream()) + .add(delta_synonyms.stream()) + .r#union(); + + let mut synonyms_builder = SetBuilder::memory(); + synonyms_builder.extend_stream(op).unwrap(); + synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => delta_synonyms, + }; + + main.set_synonyms_set(&synonyms)?; + + // update the "consistent" view of the Index + let words = main.words_set()?.unwrap_or_default(); + let ranked_map = lease_inner.ranked_map.clone();; + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + lease_inner.raw.compact(); + + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) + } +} diff --git a/meilidb-data/src/database/synonyms_deletion.rs b/meilidb-data/src/database/synonyms_deletion.rs new file mode 100644 index 000000000..8720d4b5c --- /dev/null +++ b/meilidb-data/src/database/synonyms_deletion.rs @@ -0,0 +1,71 @@ +use std::collections::BTreeSet; +use std::sync::Arc; + +use fst::{SetBuilder, set::OpBuilder}; + +use crate::database::index::InnerIndex; +use super::{Error, Index}; + +pub struct SynonymsDeletion<'a> { + inner: &'a Index, + synonyms: BTreeSet, +} + +impl<'a> SynonymsDeletion<'a> { + pub fn new(inner: &'a Index) -> SynonymsDeletion<'a> { + SynonymsDeletion { inner, synonyms: BTreeSet::new() } + } + + pub fn delete_alternatives_of(&mut self, synonym: String) { + self.synonyms.insert(synonym); + } + + pub fn finalize(self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let synonyms = &lease_inner.raw.synonyms; + let main = &lease_inner.raw.main; + + let mut synonyms_builder = SetBuilder::memory(); + + for synonym in self.synonyms { + synonyms_builder.insert(&synonym).unwrap(); + synonyms.del_alternatives_of(synonym.as_bytes())?; + } + + let delta_synonyms = synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let synonyms = match main.synonyms_set()? { + Some(synonyms) => { + let op = OpBuilder::new() + .add(synonyms.stream()) + .add(delta_synonyms.stream()) + .difference(); + + let mut synonyms_builder = SetBuilder::memory(); + synonyms_builder.extend_stream(op).unwrap(); + synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => fst::Set::default(), + }; + + main.set_synonyms_set(&synonyms)?; + + // update the "consistent" view of the Index + let words = main.words_set()?.unwrap_or_default(); + let ranked_map = lease_inner.ranked_map.clone(); + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + lease_inner.raw.compact(); + + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) + } +} diff --git a/meilidb-data/src/database/synonyms_index.rs b/meilidb-data/src/database/synonyms_index.rs new file mode 100644 index 000000000..dfc0182e4 --- /dev/null +++ b/meilidb-data/src/database/synonyms_index.rs @@ -0,0 +1,23 @@ +use crate::database::raw_index::InnerRawIndex; + +#[derive(Clone)] +pub struct SynonymsIndex(pub(crate) InnerRawIndex); + +impl SynonymsIndex { + pub fn alternatives_to(&self, word: &[u8]) -> Result, rocksdb::Error> { + match self.0.get(word)? { + Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())), + None => Ok(None), + } + } + + pub fn set_alternatives_to(&self, word: &[u8], value: Vec) -> Result<(), rocksdb::Error> { + self.0.set(word, value)?; + Ok(()) + } + + pub fn del_alternatives_of(&self, word: &[u8]) -> Result<(), rocksdb::Error> { + self.0.delete(word)?; + Ok(()) + } +}