From a226fd23c3f8576b987aca9d31dcd52943f72d4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 29 Oct 2019 15:43:26 +0100
Subject: [PATCH] Introduce the stop words deletion update type

---
 meilidb-core/src/store/mod.rs                 |   8 ++
 meilidb-core/src/update/mod.rs                |  23 ++++
 .../src/update/stop_words_addition.rs         |   2 +-
 .../src/update/stop_words_deletion.rs         | 120 ++++++++++++++++++
 4 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 meilidb-core/src/update/stop_words_deletion.rs

diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs
index d12ebf8ff..3198f455f 100644
--- a/meilidb-core/src/store/mod.rs
+++ b/meilidb-core/src/store/mod.rs
@@ -195,6 +195,14 @@ impl Index {
         )
     }
 
+    pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
+        update::StopWordsDeletion::new(
+            self.updates,
+            self.updates_results,
+            self.updates_notifier.clone(),
+        )
+    }
+
     pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
         match self.updates.last_update_id(reader)? {
             Some((id, _)) => Ok(Some(id)),
diff --git a/meilidb-core/src/update/mod.rs b/meilidb-core/src/update/mod.rs
index ecb76a079..755df56c0 100644
--- a/meilidb-core/src/update/mod.rs
+++ b/meilidb-core/src/update/mod.rs
@@ -4,6 +4,7 @@ mod documents_addition;
 mod documents_deletion;
 mod schema_update;
 mod stop_words_addition;
+mod stop_words_deletion;
 mod synonyms_addition;
 mod synonyms_deletion;
 
@@ -13,6 +14,7 @@ pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
 pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
 pub use self::schema_update::{apply_schema_update, push_schema_update};
 pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
+pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
 pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
 pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
 
@@ -37,6 +39,7 @@ pub enum Update {
     SynonymsAddition(BTreeMap<String, Vec<String>>),
     SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
     StopWordsAddition(BTreeSet<String>),
+    StopWordsDeletion(BTreeSet<String>),
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -49,6 +52,7 @@ pub enum UpdateType {
     SynonymsAddition { number: usize },
     SynonymsDeletion { number: usize },
     StopWordsAddition { number: usize },
+    StopWordsDeletion { number: usize },
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -226,6 +230,25 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
                 stop_words,
             );
 
+            (update_type, result, start.elapsed())
+        }
+        Update::StopWordsDeletion(stop_words) => {
+            let start = Instant::now();
+
+            let update_type = UpdateType::StopWordsDeletion {
+                number: stop_words.len(),
+            };
+
+            let result = apply_stop_words_deletion(
+                writer,
+                index.main,
+                index.documents_fields,
+                index.documents_fields_counts,
+                index.postings_lists,
+                index.docs_words,
+                stop_words,
+            );
+
             (update_type, result, start.elapsed())
         }
     };
diff --git a/meilidb-core/src/update/stop_words_addition.rs b/meilidb-core/src/update/stop_words_addition.rs
index 182394e59..6adba450b 100644
--- a/meilidb-core/src/update/stop_words_addition.rs
+++ b/meilidb-core/src/update/stop_words_addition.rs
@@ -95,7 +95,7 @@ pub fn apply_stop_words_addition(
         main_store.put_words_fst(writer, &word_fst)?;
     }
 
-    // now we add all of these stop words to the main store
+    // now we add all of these stop words to those from the main store
     let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
 
     let op = OpBuilder::new()
diff --git a/meilidb-core/src/update/stop_words_deletion.rs b/meilidb-core/src/update/stop_words_deletion.rs
new file mode 100644
index 000000000..11c72ded9
--- /dev/null
+++ b/meilidb-core/src/update/stop_words_deletion.rs
@@ -0,0 +1,120 @@
+use std::collections::BTreeSet;
+
+use fst::{set::OpBuilder, SetBuilder};
+
+use crate::automaton::normalize_str;
+use crate::update::documents_addition::reindex_all_documents;
+use crate::update::{next_update_id, Update};
+use crate::{store, MResult};
+
+/// Collects stop words to delete and queues them as a single update.
+pub struct StopWordsDeletion {
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    updates_notifier: crossbeam_channel::Sender<()>,
+    stop_words: BTreeSet<String>,
+}
+
+impl StopWordsDeletion {
+    /// Creates a deletion with no stop words, bound to the given update stores.
+    pub fn new(
+        updates_store: store::Updates,
+        updates_results_store: store::UpdatesResults,
+        updates_notifier: crossbeam_channel::Sender<()>,
+    ) -> StopWordsDeletion {
+        StopWordsDeletion {
+            updates_store,
+            updates_results_store,
+            updates_notifier,
+            stop_words: BTreeSet::new(),
+        }
+    }
+
+    /// Marks `stop_word` for deletion, after passing it through `normalize_str`.
+    pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
+        let stop_word = normalize_str(stop_word.as_ref());
+        self.stop_words.insert(stop_word);
+    }
+
+    /// Stores the collected words as a pending update and returns its update id.
+    pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
+        // signal the notifier channel; a send error is ignored
+        let _ = self.updates_notifier.send(());
+        let update_id = push_stop_words_deletion(
+            writer,
+            self.updates_store,
+            self.updates_results_store,
+            self.stop_words,
+        )?;
+        Ok(update_id)
+    }
+}
+
+/// Writes a `StopWordsDeletion` update under the next update id and returns that id.
+pub fn push_stop_words_deletion(
+    writer: &mut heed::RwTxn,
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    deletion: BTreeSet<String>,
+) -> MResult<u64> {
+    let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
+
+    let update = Update::StopWordsDeletion(deletion);
+    updates_store.put_update(writer, last_update_id, &update)?;
+
+    Ok(last_update_id)
+}
+
+/// Removes the `deletion` words from the stored stop words, then reindexes all documents.
+pub fn apply_stop_words_deletion(
+    writer: &mut heed::RwTxn,
+    main_store: store::Main,
+    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    postings_lists_store: store::PostingsLists,
+    docs_words_store: store::DocsWords,
+    deletion: BTreeSet<String>,
+) -> MResult<()> {
+    let mut stop_words_builder = SetBuilder::memory();
+
+    // a BTreeSet iterates in ascending order, which the fst builder needs for its keys
+    for word in deletion {
+        stop_words_builder.insert(&word).unwrap();
+    }
+
+    // create the new delta stop words fst
+    let delta_stop_words = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    // now we delete all of these stop words from the main store
+    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+
+    let op = OpBuilder::new()
+        .add(&stop_words_fst)
+        .add(&delta_stop_words)
+        .difference();
+
+    let mut stop_words_builder = SetBuilder::memory();
+    stop_words_builder.extend_stream(op).unwrap();
+    let stop_words_fst = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    main_store.put_stop_words_fst(writer, &stop_words_fst)?;
+
+    // now that we have setup the stop words
+    // lets reindex everything...
+    reindex_all_documents(
+        writer,
+        main_store,
+        documents_fields_store,
+        documents_fields_counts_store,
+        postings_lists_store,
+        docs_words_store,
+    )?;
+
+    Ok(())
+}