From d57026cd969e5b41621d6f4a592bd98b994eb0fd Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 25 Jul 2023 15:01:42 +0200
Subject: [PATCH] Support synonyms synergies

---
 .../tests/settings/tokenizer_customization.rs | 263 ++++++++++++++++++
 milli/src/update/settings.rs                  |  66 ++++-
 2 files changed, 314 insertions(+), 15 deletions(-)

diff --git a/meilisearch/tests/settings/tokenizer_customization.rs b/meilisearch/tests/settings/tokenizer_customization.rs
index 75bea560b..62a1440b2 100644
--- a/meilisearch/tests/settings/tokenizer_customization.rs
+++ b/meilisearch/tests/settings/tokenizer_customization.rs
@@ -194,3 +194,266 @@ async fn set_and_search() {
         })
         .await;
 }
+
+#[actix_rt::test]
+async fn advanced_synergies() {
+    let documents = json!([
+        {
+            "id": 1,
+            "content": "J.R.R. Tolkien",
+        },
+        {
+            "id": 2,
+            "content": "J. R. R. Tolkien",
+        },
+        {
+            "id": 3,
+            "content": "jrr Tolkien",
+        },
+        {
+            "id": 4,
+            "content": "J.K. Rowlings",
+        },
+        {
+            "id": 5,
+            "content": "J. K. Rowlings",
+        },
+        {
+            "id": 6,
+            "content": "jk Rowlings",
+        },
+    ]);
+
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    let (_response, _code) = index
+        .update_settings(json!({
+            "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
+            "synonyms": {
+                "J.R.R.": ["jrr", "J. R. R."],
+                "J. R. R.": ["jrr", "J.R.R."],
+                "jrr": ["J.R.R.", "J. R. R."],
+                "J.K.": ["jk", "J. K."],
+                "J. K.": ["jk", "J.K."],
+                "jk": ["J.K.", "J. K."],
+            }
+        }))
+        .await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "J.R.R.", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 1,
+                "content": "J.R.R. Tolkien",
+                "_formatted": {
+                  "id": "1",
+                  "content": "J.R.R. Tolkien"
+                }
+              },
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "J. R. R. Tolkien"
+                }
+              },
+              {
+                "id": 3,
+                "content": "jrr Tolkien",
+                "_formatted": {
+                  "id": "3",
+                  "content": "jrr Tolkien"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "jrr", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 3,
+                "content": "jrr Tolkien",
+                "_formatted": {
+                  "id": "3",
+                  "content": "jrr Tolkien"
+                }
+              },
+              {
+                "id": 1,
+                "content": "J.R.R. Tolkien",
+                "_formatted": {
+                  "id": "1",
+                  "content": "J.R.R. Tolkien"
+                }
+              },
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "J. R. R. Tolkien"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "J. R. R.", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "J. R. R. Tolkien"
+                }
+              },
+              {
+                "id": 1,
+                "content": "J.R.R. Tolkien",
+                "_formatted": {
+                  "id": "1",
+                  "content": "J.R.R. Tolkien"
Tolkien" + } + }, + { + "id": 3, + "content": "jrr Tolkien", + "_formatted": { + "id": "3", + "content": "jrr Tolkien" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 6, + "content": "jk Rowlings", + "_formatted": { + "id": "6", + "content": "jk Rowlings" + } + }, + { + "id": 4, + "content": "J.K. Rowlings", + "_formatted": { + "id": "4", + "content": "J.K. Rowlings" + } + }, + { + "id": 5, + "content": "J. K. Rowlings", + "_formatted": { + "id": "5", + "content": "J. K. Rowlings" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "J.K.", "attributesToHighlight": ["content"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 4, + "content": "J.K. Rowlings", + "_formatted": { + "id": "4", + "content": "J.K. Rowlings" + } + }, + { + "id": 5, + "content": "J. K. Rowlings", + "_formatted": { + "id": "5", + "content": "J. K. Rowlings" + } + }, + { + "id": 6, + "content": "jk Rowlings", + "_formatted": { + "id": "6", + "content": "jk Rowlings" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "J. K.", "attributesToHighlight": ["content"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 5, + "content": "J. K. Rowlings", + "_formatted": { + "id": "5", + "content": "J. K. Rowlings" + } + }, + { + "id": 4, + "content": "J.K. Rowlings", + "_formatted": { + "id": "4", + "content": "J.K. Rowlings" + } + }, + { + "id": 6, + "content": "jk Rowlings", + "_formatted": { + "id": "6", + "content": "jk Rowlings" + } + }, + { + "id": 2, + "content": "J. R. R. Tolkien", + "_formatted": { + "id": "2", + "content": "J. R. R. Tolkien" + } + } + ] + "###); + }) + .await; +} diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index bdae5d7b4..8f5a71f1d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -491,57 +491,78 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn update_non_separator_tokens(&mut self) -> Result { - match self.non_separator_tokens { + let changes = match self.non_separator_tokens { Setting::Set(ref non_separator_tokens) => { let current = self.index.non_separator_tokens(self.wtxn)?; // Does the new list differ from the previous one? if current.map_or(true, |current| ¤t != non_separator_tokens) { self.index.put_non_separator_tokens(self.wtxn, non_separator_tokens)?; - Ok(true) + true } else { - Ok(false) + false } } - Setting::Reset => Ok(self.index.delete_non_separator_tokens(self.wtxn)?), - Setting::NotSet => Ok(false), + Setting::Reset => self.index.delete_non_separator_tokens(self.wtxn)?, + Setting::NotSet => false, + }; + + // the synonyms must be updated if non separator tokens have been updated. + if changes { + self.update_synonyms()?; } + + Ok(changes) } fn update_separator_tokens(&mut self) -> Result { - match self.separator_tokens { + let changes = match self.separator_tokens { Setting::Set(ref separator_tokens) => { let current = self.index.separator_tokens(self.wtxn)?; // Does the new list differ from the previous one? 
                 if current.map_or(true, |current| &current != separator_tokens) {
                     self.index.put_separator_tokens(self.wtxn, separator_tokens)?;
-                    Ok(true)
+                    true
                 } else {
-                    Ok(false)
+                    false
                 }
             }
-            Setting::Reset => Ok(self.index.delete_separator_tokens(self.wtxn)?),
-            Setting::NotSet => Ok(false),
+            Setting::Reset => self.index.delete_separator_tokens(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        // the synonyms must be updated if separator tokens have been updated.
+        if changes {
+            self.update_synonyms()?;
         }
+
+        Ok(changes)
     }
 
     fn update_dictionary(&mut self) -> Result<bool> {
-        match self.dictionary {
+        let changes = match self.dictionary {
             Setting::Set(ref dictionary) => {
                 let current = self.index.dictionary(self.wtxn)?;
 
                 // Does the new list differ from the previous one?
                 if current.map_or(true, |current| &current != dictionary) {
                     self.index.put_dictionary(self.wtxn, dictionary)?;
-                    Ok(true)
+                    true
                 } else {
-                    Ok(false)
+                    false
                 }
             }
-            Setting::Reset => Ok(self.index.delete_dictionary(self.wtxn)?),
-            Setting::NotSet => Ok(false),
+            Setting::Reset => self.index.delete_dictionary(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        // the synonyms must be updated if dictionary has been updated.
+        if changes {
+            self.update_synonyms()?;
         }
+
+        Ok(changes)
     }
 
     fn update_synonyms(&mut self) -> Result<bool> {
@@ -565,6 +586,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         if let Some(ref stop_words) = stop_words {
             builder.stop_words(stop_words);
         }
+
+        let separators = self.index.allowed_separators(self.wtxn)?;
+        let separators: Option<Vec<&str>> =
+            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        if let Some(ref separators) = separators {
+            builder.separators(separators);
+        }
+
+        let dictionary = self.index.dictionary(self.wtxn)?;
+        let dictionary: Option<Vec<&str>> =
+            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        if let Some(ref dictionary) = dictionary {
+            builder.words_dict(dictionary);
+        }
+
         let tokenizer = builder.build();
 
         let mut new_synonyms = HashMap::new();
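
The core of the milli change above is that update_synonyms() now feeds the user-defined separators and dictionary into the tokenizer builder before normalizing synonym entries, and update_non_separator_tokens(), update_separator_tokens() and update_dictionary() re-run update_synonyms() whenever their value changes, so the stored synonyms always match the current tokenizer configuration. Below is a minimal standalone sketch of that idea, not part of the patch: it calls the charabia tokenizer directly; only words_dict(), separators(), stop_words() and build() appear in the patch itself, while TokenizerBuilder::new(), tokenize(), is_word() and lemma() are assumptions about charabia's public API, and charabia would have to be added as a dependency of the example crate.

    // Standalone sketch (assumed charabia dependency), mirroring the builder
    // calls added to update_synonyms() in the patch above.
    use charabia::TokenizerBuilder;

    fn main() {
        // User dictionary mirroring the "dictionary" setting used in the test above.
        let dictionary = ["J.R.R.", "J. R. R.", "J.K.", "J. K."];

        // Apply the dictionary before building the tokenizer, like update_synonyms()
        // now does with the stored dictionary and separator settings.
        let mut builder = TokenizerBuilder::new();
        builder.words_dict(&dictionary);
        let tokenizer = builder.build();

        // Tokenize the different spellings used by the test documents; with the
        // dictionary applied, each dictionary entry should come out as a single
        // word token, so the synonym pairs defined in the test can line up.
        for text in ["J.R.R. Tolkien", "J. R. R. Tolkien", "jrr Tolkien"] {
            let lemmas: Vec<String> = tokenizer
                .tokenize(text)
                .filter(|token| token.is_word())
                .map(|token| token.lemma().to_string())
                .collect();
            println!("{text:?} -> {lemmas:?}");
        }
    }

Without the dictionary, an entry like "J.R.R." would be segmented on the dots and the synonyms would be normalized differently from the indexed documents; that mismatch is exactly what the advanced_synergies test above guards against.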