Introduce the stop words addition update type

This commit is contained in:
Clément Renault 2019-10-29 15:24:09 +01:00
parent 32d2cc3aea
commit 776673ebae
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 160 additions and 1 deletions

View File

@ -9,6 +9,7 @@ const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map"; const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema"; const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms"; const SYNONYMS_KEY: &str = "synonyms";
const STOP_WORDS_KEY: &str = "stop-words";
const WORDS_KEY: &str = "words"; const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
@ -71,6 +72,24 @@ impl Main {
} }
} }
pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main
.put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
}
pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64> pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
where where
F: Fn(u64) -> u64, F: Fn(u64) -> u64,

View File

@ -187,6 +187,14 @@ impl Index {
) )
} }
pub fn stop_words_addition(&self) -> update::StopWordsAddition {
update::StopWordsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> { pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
match self.updates.last_update_id(reader)? { match self.updates.last_update_id(reader)? {
Some((id, _)) => Ok(Some(id)), Some((id, _)) => Ok(Some(id)),

View File

@ -3,6 +3,7 @@ mod customs_update;
mod documents_addition; mod documents_addition;
mod documents_deletion; mod documents_deletion;
mod schema_update; mod schema_update;
mod stop_words_addition;
mod synonyms_addition; mod synonyms_addition;
mod synonyms_deletion; mod synonyms_deletion;
@ -11,11 +12,12 @@ pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition}; pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update}; pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition}; pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion}; pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::cmp; use std::cmp;
use std::collections::BTreeMap; use std::collections::{BTreeMap, BTreeSet};
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use heed::Result as ZResult; use heed::Result as ZResult;
@ -34,6 +36,7 @@ pub enum Update {
DocumentsDeletion(Vec<DocumentId>), DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>), SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>), SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
StopWordsAddition(BTreeSet<String>),
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -45,6 +48,7 @@ pub enum UpdateType {
DocumentsDeletion { number: usize }, DocumentsDeletion { number: usize },
SynonymsAddition { number: usize }, SynonymsAddition { number: usize },
SynonymsDeletion { number: usize }, SynonymsDeletion { number: usize },
StopWordsAddition { number: usize },
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -210,6 +214,18 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms); let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
}
Update::StopWordsAddition(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsAddition {
number: stop_words.len(),
};
let result =
apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
} }
}; };

View File

@ -0,0 +1,116 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsAddition {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
stop_words: BTreeSet<String>,
}
impl StopWordsAddition {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> StopWordsAddition {
StopWordsAddition {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_stop_words_addition(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_addition(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsAddition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_addition(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
addition: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in addition {
stop_words_builder.insert(&word).unwrap();
// we remove every posting list associated to a new stop word
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// we also need to remove all the stop words from the main fst
if let Some(word_fst) = main_store.words_fst(writer)? {
let op = OpBuilder::new()
.add(&word_fst)
.add(&delta_stop_words)
.difference();
let mut word_fst_builder = SetBuilder::memory();
word_fst_builder.extend_stream(op).unwrap();
let word_fst = word_fst_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_words_fst(writer, &word_fst)?;
}
// now we add all of these stop words to the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.r#union();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
Ok(())
}