From 9bbffb8fee9ab73fb59eab731f1e739c85e536dd Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 14:03:31 +0100 Subject: [PATCH] add exact words setting --- milli/src/index.rs | 22 ++++++++++++++++++++++ milli/src/update/settings.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 853e7537d..c0be985da 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -52,6 +52,7 @@ pub mod main_key { pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; + pub const EXACT_WORDS: &str = "exact-words"; } pub mod db_name { @@ -927,6 +928,27 @@ impl Index { self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; Ok(()) } + + /// List the words on which typos are not allowed + pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? 
{ + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + pub(crate) fn put_exact_words>( + &self, + txn: &mut RwTxn, + words: &fst::Set, + ) -> Result<()> { + self.main.put::<_, Str, ByteSlice>( + txn, + main_key::EXACT_WORDS, + words.as_fst().as_bytes(), + )?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c03d6e0ae..513dee42c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> { authorize_typos: Setting, min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, + exact_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -113,6 +114,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, + exact_words: Setting::NotSet, indexer_config, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, @@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.min_word_len_one_typo = Setting::Reset; } + pub fn set_exact_words(&mut self, words: Vec) { + self.exact_words = Setting::Set(words); + } + + pub fn reset_exact_words(&mut self) { + self.exact_words = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -526,6 +536,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_exact_words(&mut self) -> Result<()> { + match self.exact_words { + Setting::Set(ref mut words) => { + words.sort_unstable(); + let words = fst::Set::from_iter(words)?; + self.index.put_exact_words(&mut self.wtxn, &words)?; + } + Setting::Reset => { + self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?; + } + Setting::NotSet => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where 
F: Fn(UpdateIndexingStep) + Sync, @@ -543,6 +569,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_primary_key()?; self.update_authorize_typos()?; self.update_min_typo_word_len()?; + self.update_exact_words()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute,