From 06b2a587affa0ff4c06a4aff81724f96c8b1e719 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 12 Jan 2021 13:53:32 +0100 Subject: [PATCH] normalize synonyms during indexation --- .../src/update/settings_update.rs | 19 ++++- meilisearch-http/tests/search_settings.rs | 83 +++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/meilisearch-core/src/update/settings_update.rs b/meilisearch-core/src/update/settings_update.rs index 7b82c1c6e..205ef216a 100644 --- a/meilisearch-core/src/update/settings_update.rs +++ b/meilisearch-core/src/update/settings_update.rs @@ -1,9 +1,10 @@ -use std::collections::{BTreeMap, BTreeSet}; +use std::{borrow::Cow, collections::{BTreeMap, BTreeSet}}; use heed::Result as ZResult; use fst::{set::OpBuilder, SetBuilder}; use sdset::SetBuf; use meilisearch_schema::Schema; +use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use crate::database::{MainT, UpdateT}; use crate::settings::{UpdateState, SettingsUpdate, RankingRule}; @@ -289,13 +290,27 @@ pub fn apply_synonyms_update( let main_store = index.main; let synonyms_store = index.synonyms; + let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?; + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + + fn normalize<T: AsRef<[u8]>>(analyzer: &Analyzer<T>, text: &str) -> String { + analyzer.analyze(&text) + .tokens() + .fold(String::new(), |mut s, t| { + s.push_str(&t.word); + s + }) + } let mut synonyms_builder = SetBuilder::memory(); synonyms_store.clear(writer)?; - for (word, alternatives) in synonyms.clone() { + for (word, alternatives) in synonyms { + let word = normalize(&analyzer, &word); + synonyms_builder.insert(&word)?; let alternatives = { + let alternatives = alternatives.iter().map(|text| normalize(&analyzer, &text)).collect(); let alternatives = SetBuf::from_dirty(alternatives); let mut alternatives_builder = SetBuilder::memory(); alternatives_builder.extend_iter(alternatives)?; diff --git 
a/meilisearch-http/tests/search_settings.rs b/meilisearch-http/tests/search_settings.rs index 46417498d..97d27023a 100644 --- a/meilisearch-http/tests/search_settings.rs +++ b/meilisearch-http/tests/search_settings.rs @@ -167,6 +167,89 @@ async fn search_with_settings_stop_words() { async fn search_with_settings_synonyms() { let mut server = common::Server::test_server().await; + let config = json!({ + "rankingRules": [ + "typo", + "words", + "proximity", + "attribute", + "wordsPosition", + "desc(age)", + "exactness", + "desc(balance)" + ], + "distinctAttribute": null, + "searchableAttributes": [ + "name", + "age", + "color", + "gender", + "email", + "address", + "about" + ], + "displayedAttributes": [ + "name", + "age", + "gender", + "color", + "email", + "phone", + "address", + "balance" + ], + "stopWords": null, + "synonyms": { + "Application": [ + "Exercitation" + ] + }, + }); + + server.update_all_settings(config).await; + + let query = "q=application&limit=3"; + let expect = json!([ + { + "balance": "$1,921.58", + "age": 31, + "color": "Green", + "name": "Harper Carson", + "gender": "male", + "email": "harpercarson@chorizon.com", + "phone": "+1 (912) 430-3243", + "address": "883 Dennett Place, Knowlton, New Mexico, 9219" + }, + { + "balance": "$1,706.13", + "age": 27, + "color": "Green", + "name": "Cherry Orr", + "gender": "female", + "email": "cherryorr@chorizon.com", + "phone": "+1 (995) 479-3174", + "address": "442 Beverly Road, Ventress, New Mexico, 3361" + }, + { + "balance": "$1,476.39", + "age": 28, + "color": "brown", + "name": "Maureen Dale", + "gender": "female", + "email": "maureendale@chorizon.com", + "phone": "+1 (984) 538-3684", + "address": "817 Newton Street, Bannock, Wyoming, 1468" + } + ]); + + let (response, _status_code) = server.search_get(query).await; + assert_json_eq!(expect, response["hits"].clone(), ordered: false); +} + +#[actix_rt::test] +async fn search_with_settings_normalized_synonyms() { + let mut server = 
common::Server::test_server().await; + let config = json!({ "rankingRules": [ "typo",