From b0c1a9504ab0d827a159132a8096932b2ba7891b Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 26 Jul 2023 09:33:42 +0200
Subject: [PATCH] ensure the synonyms are updated when the tokenizer settings
 are changed

---
 .../tests/settings/tokenizer_customization.rs | 10 ++++++-
 milli/src/index.rs                            | 24 ++++++++++++++--
 milli/src/search/new/tests/integration.rs     |  4 +--
 milli/src/search/new/tests/proximity.rs       |  4 +--
 milli/src/search/new/tests/typo.rs            |  4 +--
 milli/src/update/settings.rs                  | 28 +++++++++----------
 milli/tests/search/mod.rs                     |  4 +--
 7 files changed, 52 insertions(+), 26 deletions(-)
diff --git a/meilisearch/tests/settings/tokenizer_customization.rs b/meilisearch/tests/settings/tokenizer_customization.rs
index 62a1440b2..fc5d8a880 100644
--- a/meilisearch/tests/settings/tokenizer_customization.rs
+++ b/meilisearch/tests/settings/tokenizer_customization.rs
@@ -232,7 +232,7 @@ async fn advanced_synergies() {
 
     let (_response, _code) = index
         .update_settings(json!({
-            "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
+            "dictionary": ["J.R.R.", "J. R. R."],
             "synonyms": {
                 "J.R.R.": ["jrr", "J. R. R."],
                 "J. R. R.": ["jrr", "J.R.R."],
@@ -347,6 +347,14 @@ async fn advanced_synergies() {
         })
         .await;
 
+    // Only update dictionary, the synonyms should be recomputed.
+    let (_response, _code) = index
+        .update_settings(json!({
+            "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
+        }))
+        .await;
+    index.wait_task(2).await;
+
     index
         .search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| {
             snapshot!(code, @"200 OK");
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 77acd4cb8..e1314896b 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{BTreeSet, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@@ -65,6 +65,7 @@ pub mod main_key {
     pub const DICTIONARY_KEY: &str = "dictionary";
     pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
     pub const SYNONYMS_KEY: &str = "synonyms";
+    pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
     pub const WORDS_FST_KEY: &str = "words-fst";
     pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
     pub const CREATED_AT_KEY: &str = "created-at";
@@ -1138,12 +1139,29 @@ impl Index {
         &self,
         wtxn: &mut RwTxn,
         synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
+        user_defined_synonyms: &BTreeMap<String, Vec<String>>,
     ) -> heed::Result<()> {
-        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
+        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)?;
+        self.main.put::<_, Str, SerdeBincode<_>>(
+            wtxn,
+            main_key::USER_DEFINED_SYNONYMS_KEY,
+            user_defined_synonyms,
+        )
     }
 
     pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-        self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)
+        self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)?;
+        self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY)
+    }
+
+    pub fn user_defined_synonyms(
+        &self,
+        rtxn: &RoTxn,
+    ) -> heed::Result<BTreeMap<String, Vec<String>>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)?
+            .unwrap_or_default())
     }
 
     pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
diff --git a/milli/src/search/new/tests/integration.rs b/milli/src/search/new/tests/integration.rs
index 3abb1878f..e2ea4580e 100644
--- a/milli/src/search/new/tests/integration.rs
+++ b/milli/src/search/new/tests/integration.rs
@@ -2,7 +2,7 @@ use std::io::Cursor;
 
 use big_s::S;
 use heed::EnvOpenOptions;
-use maplit::{hashmap, hashset};
+use maplit::{btreemap, hashset};
 
 use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
@@ -33,7 +33,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("tag"),
         S("asc_desc_rank"),
     });
-    builder.set_synonyms(hashmap! {
+    builder.set_synonyms(btreemap! {
         S("hello") => vec![S("good morning")],
         S("world") => vec![S("earth")],
         S("america") => vec![S("the united states")],
diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs
index b54007c6e..f9ad2b57e 100644
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@@ -15,7 +15,7 @@ they store fewer sprximities than the regular word sprximity DB.
 
 */
 
-use std::collections::HashMap;
+use std::collections::BTreeMap;
 
 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@@ -336,7 +336,7 @@ fn test_proximity_split_word() {
 
     index
         .update_settings(|s| {
-            let mut syns = HashMap::new();
+            let mut syns = BTreeMap::new();
             syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
             s.set_synonyms(syns);
         })
diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs
index 4f5e851f5..61d4c4387 100644
--- a/milli/src/search/new/tests/typo.rs
+++ b/milli/src/search/new/tests/typo.rs
@@ -18,7 +18,7 @@ if `words` doesn't exist before it.
 14. Synonyms cost nothing according to the typo ranking rule
 */
 
-use std::collections::HashMap;
+use std::collections::BTreeMap;
 
 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@@ -591,7 +591,7 @@ fn test_typo_synonyms() {
         .update_settings(|s| {
             s.set_criteria(vec![Criterion::Typo]);
 
-            let mut synonyms = HashMap::new();
+            let mut synonyms = BTreeMap::new();
             synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
             synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
 
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 8f5a71f1d..360fdb474 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeSet, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
 use charabia::{Normalize, Tokenizer, TokenizerBuilder};
@@ -116,7 +116,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
     separator_tokens: Setting<BTreeSet<String>>,
     dictionary: Setting<BTreeSet<String>>,
     distinct_field: Setting<String>,
-    synonyms: Setting<HashMap<String, Vec<String>>>,
+    synonyms: Setting<BTreeMap<String, Vec<String>>>,
     primary_key: Setting<String>,
     authorize_typos: Setting<bool>,
     min_word_len_two_typos: Setting<u8>,
@@ -256,7 +256,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.synonyms = Setting::Reset;
     }
 
-    pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
+    pub fn set_synonyms(&mut self, synonyms: BTreeMap<String, Vec<String>>) {
         self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
     }
 
@@ -508,8 +508,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         };
 
         // the synonyms must be updated if non separator tokens have been updated.
-        if changes {
-            self.update_synonyms()?;
+        if changes && self.synonyms == Setting::NotSet {
+            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
         }
 
         Ok(changes)
@@ -533,8 +533,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         };
 
         // the synonyms must be updated if separator tokens have been updated.
-        if changes {
-            self.update_synonyms()?;
+        if changes && self.synonyms == Setting::NotSet {
+            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
         }
 
         Ok(changes)
@@ -558,8 +558,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         };
 
         // the synonyms must be updated if dictionary has been updated.
-        if changes {
-            self.update_synonyms()?;
+        if changes && self.synonyms == Setting::NotSet {
+            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
         }
 
         Ok(changes)
@@ -567,7 +567,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
 
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
-            Setting::Set(ref synonyms) => {
+            Setting::Set(ref user_synonyms) => {
                 fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
@@ -604,7 +604,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 let tokenizer = builder.build();
 
                 let mut new_synonyms = HashMap::new();
-                for (word, synonyms) in synonyms {
+                for (word, synonyms) in user_synonyms {
                     // Normalize both the word and associated synonyms.
                     let normalized_word = normalize(&tokenizer, word);
                     let normalized_synonyms =
@@ -625,7 +625,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 let old_synonyms = self.index.synonyms(self.wtxn)?;
 
                 if new_synonyms != old_synonyms {
-                    self.index.put_synonyms(self.wtxn, &new_synonyms)?;
+                    self.index.put_synonyms(self.wtxn, &new_synonyms, &user_synonyms)?;
                     Ok(true)
                 } else {
                     Ok(false)
@@ -912,7 +912,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
 mod tests {
     use big_s::S;
     use heed::types::ByteSlice;
-    use maplit::{btreeset, hashmap, hashset};
+    use maplit::{btreemap, btreeset, hashset};
 
     use super::*;
     use crate::error::Error;
@@ -1378,7 +1378,7 @@ mod tests {
         // In the same transaction provide some synonyms
         index
             .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_synonyms(hashmap! {
+                settings.set_synonyms(btreemap! {
                     "blini".to_string() => vec!["crepes".to_string()],
                     "super like".to_string() => vec!["love".to_string()],
                     "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index 7b2d9ad6d..1c68cfff2 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -5,7 +5,7 @@ use std::io::Cursor;
 use big_s::S;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
-use maplit::{hashmap, hashset};
+use maplit::{btreemap, hashset};
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
@@ -51,7 +51,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("tag"),
         S("asc_desc_rank"),
     });
-    builder.set_synonyms(hashmap! {
+    builder.set_synonyms(btreemap! {
         S("hello") => vec![S("good morning")],
         S("world") => vec![S("earth")],
         S("america") => vec![S("the united states")],