From b0c1a9504ab0d827a159132a8096932b2ba7891b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 26 Jul 2023 09:33:42 +0200 Subject: [PATCH] ensure the synonyms are updated when the tokenizer settings are changed --- .../tests/settings/tokenizer_customization.rs | 10 ++++++- milli/src/index.rs | 24 ++++++++++++++-- milli/src/search/new/tests/integration.rs | 4 +-- milli/src/search/new/tests/proximity.rs | 4 +-- milli/src/search/new/tests/typo.rs | 4 +-- milli/src/update/settings.rs | 28 +++++++++---------- milli/tests/search/mod.rs | 4 +-- 7 files changed, 52 insertions(+), 26 deletions(-) diff --git a/meilisearch/tests/settings/tokenizer_customization.rs b/meilisearch/tests/settings/tokenizer_customization.rs index 62a1440b2..fc5d8a880 100644 --- a/meilisearch/tests/settings/tokenizer_customization.rs +++ b/meilisearch/tests/settings/tokenizer_customization.rs @@ -232,7 +232,7 @@ async fn advanced_synergies() { let (_response, _code) = index .update_settings(json!({ - "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."], + "dictionary": ["J.R.R.", "J. R. R."], "synonyms": { "J.R.R.": ["jrr", "J. R. R."], "J. R. R.": ["jrr", "J.R.R."], @@ -347,6 +347,14 @@ async fn advanced_synergies() { }) .await; + // Only update dictionary, the synonyms should be recomputed. + let (_response, _code) = index + .update_settings(json!({ + "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."], + })) + .await; + index.wait_task(2).await; + index .search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| { snapshot!(code, @"200 OK"); diff --git a/milli/src/index.rs b/milli/src/index.rs index 77acd4cb8..e1314896b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::File; use std::mem::size_of; use std::path::Path; @@ -65,6 +65,7 @@ pub mod main_key { pub const DICTIONARY_KEY: &str = "dictionary"; pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; pub const SYNONYMS_KEY: &str = "synonyms"; + pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms"; pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; pub const CREATED_AT_KEY: &str = "created-at"; @@ -1138,12 +1139,29 @@ impl Index { &self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>, + user_defined_synonyms: &BTreeMap>, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) + self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)?; + self.main.put::<_, Str, SerdeBincode<_>>( + wtxn, + main_key::USER_DEFINED_SYNONYMS_KEY, + user_defined_synonyms, + ) } pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)?; + self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY) + } + + pub fn user_defined_synonyms( + &self, + rtxn: &RoTxn, + ) -> heed::Result>> { + Ok(self + .main + .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)? + .unwrap_or_default()) } pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { diff --git a/milli/src/search/new/tests/integration.rs b/milli/src/search/new/tests/integration.rs index 3abb1878f..e2ea4580e 100644 --- a/milli/src/search/new/tests/integration.rs +++ b/milli/src/search/new/tests/integration.rs @@ -2,7 +2,7 @@ use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; -use maplit::{hashmap, hashset}; +use maplit::{btreemap, hashset}; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; @@ -33,7 +33,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), }); - builder.set_synonyms(hashmap! { + builder.set_synonyms(btreemap! { S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], S("america") => vec![S("the united states")], diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index b54007c6e..f9ad2b57e 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -15,7 +15,7 @@ they store fewer sprximities than the regular word sprximity DB. */ -use std::collections::HashMap; +use std::collections::BTreeMap; use crate::index::tests::TempIndex; use crate::search::new::tests::collect_field_values; @@ -336,7 +336,7 @@ fn test_proximity_split_word() { index .update_settings(|s| { - let mut syns = HashMap::new(); + let mut syns = BTreeMap::new(); syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]); s.set_synonyms(syns); }) diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 4f5e851f5..61d4c4387 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -18,7 +18,7 @@ if `words` doesn't exist before it. 14. Synonyms cost nothing according to the typo ranking rule */ -use std::collections::HashMap; +use std::collections::BTreeMap; use crate::index::tests::TempIndex; use crate::search::new::tests::collect_field_values; @@ -591,7 +591,7 @@ fn test_typo_synonyms() { .update_settings(|s| { s.set_criteria(vec![Criterion::Typo]); - let mut synonyms = HashMap::new(); + let mut synonyms = BTreeMap::new(); synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]); synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 8f5a71f1d..360fdb474 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::result::Result as StdResult; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; @@ -116,7 +116,7 @@ pub struct Settings<'a, 't, 'u, 'i> { separator_tokens: Setting>, dictionary: Setting>, distinct_field: Setting, - synonyms: Setting>>, + synonyms: Setting>>, primary_key: Setting, authorize_typos: Setting, min_word_len_two_typos: Setting, @@ -256,7 +256,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.synonyms = Setting::Reset; } - pub fn set_synonyms(&mut self, synonyms: HashMap>) { + pub fn set_synonyms(&mut self, synonyms: BTreeMap>) { self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) } } @@ -508,8 +508,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // the synonyms must be updated if non separator tokens have been updated. - if changes { - self.update_synonyms()?; + if changes && self.synonyms == Setting::NotSet { + self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?); } Ok(changes) @@ -533,8 +533,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // the synonyms must be updated if separator tokens have been updated. - if changes { - self.update_synonyms()?; + if changes && self.synonyms == Setting::NotSet { + self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?); } Ok(changes) @@ -558,8 +558,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // the synonyms must be updated if dictionary has been updated. - if changes { - self.update_synonyms()?; + if changes && self.synonyms == Setting::NotSet { + self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?); } Ok(changes) @@ -567,7 +567,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_synonyms(&mut self) -> Result { match self.synonyms { - Setting::Set(ref synonyms) => { + Setting::Set(ref user_synonyms) => { fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec { tokenizer .tokenize(text) @@ -604,7 +604,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let tokenizer = builder.build(); let mut new_synonyms = HashMap::new(); - for (word, synonyms) in synonyms { + for (word, synonyms) in user_synonyms { // Normalize both the word and associated synonyms. let normalized_word = normalize(&tokenizer, word); let normalized_synonyms = @@ -625,7 +625,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let old_synonyms = self.index.synonyms(self.wtxn)?; if new_synonyms != old_synonyms { - self.index.put_synonyms(self.wtxn, &new_synonyms)?; + self.index.put_synonyms(self.wtxn, &new_synonyms, &user_synonyms)?; Ok(true) } else { Ok(false) @@ -912,7 +912,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { mod tests { use big_s::S; use heed::types::ByteSlice; - use maplit::{btreeset, hashmap, hashset}; + use maplit::{btreemap, btreeset, hashset}; use super::*; use crate::error::Error; @@ -1378,7 +1378,7 @@ mod tests { // In the same transaction provide some synonyms index .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_synonyms(hashmap! { + settings.set_synonyms(btreemap! { "blini".to_string() => vec!["crepes".to_string()], "super like".to_string() => vec!["love".to_string()], "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 7b2d9ad6d..1c68cfff2 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -5,7 +5,7 @@ use std::io::Cursor; use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; -use maplit::{hashmap, hashset}; +use maplit::{btreemap, hashset}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy}; @@ -51,7 +51,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), }); - builder.set_synonyms(hashmap! { + builder.set_synonyms(btreemap! { S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], S("america") => vec![S("the united states")],