Normalize synonyms during indexing

This commit is contained in:
many 2021-01-12 13:53:32 +01:00
parent fa40c6e3d4
commit 06b2a587af
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
2 changed files with 100 additions and 2 deletions

View File

@ -1,9 +1,10 @@
use std::collections::{BTreeMap, BTreeSet}; use std::{borrow::Cow, collections::{BTreeMap, BTreeSet}};
use heed::Result as ZResult; use heed::Result as ZResult;
use fst::{set::OpBuilder, SetBuilder}; use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf; use sdset::SetBuf;
use meilisearch_schema::Schema; use meilisearch_schema::Schema;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use crate::database::{MainT, UpdateT}; use crate::database::{MainT, UpdateT};
use crate::settings::{UpdateState, SettingsUpdate, RankingRule}; use crate::settings::{UpdateState, SettingsUpdate, RankingRule};
@ -289,13 +290,27 @@ pub fn apply_synonyms_update(
let main_store = index.main; let main_store = index.main;
let synonyms_store = index.synonyms; let synonyms_store = index.synonyms;
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
/// Runs `text` through `analyzer` and concatenates the `word` of every token
/// it yields, in order, into one owned `String`.
fn normalize<T: AsRef<[u8]>>(analyzer: &Analyzer<T>, text: &str) -> String {
    let mut normalized = String::new();
    for token in analyzer.analyze(text).tokens() {
        normalized.push_str(&token.word);
    }
    normalized
}
let mut synonyms_builder = SetBuilder::memory(); let mut synonyms_builder = SetBuilder::memory();
synonyms_store.clear(writer)?; synonyms_store.clear(writer)?;
for (word, alternatives) in synonyms.clone() { for (word, alternatives) in synonyms {
let word = normalize(&analyzer, &word);
synonyms_builder.insert(&word)?; synonyms_builder.insert(&word)?;
let alternatives = { let alternatives = {
let alternatives = alternatives.iter().map(|text| normalize(&analyzer, &text)).collect();
let alternatives = SetBuf::from_dirty(alternatives); let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory(); let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives)?; alternatives_builder.extend_iter(alternatives)?;

View File

@ -167,6 +167,89 @@ async fn search_with_settings_stop_words() {
// Integration test: registers the synonym "Application" -> ["Exercitation"]
// through the settings route, then searches for the lowercase query
// "application" and checks the returned hits.
// NOTE(review): the synonym key is capitalized while the query is lowercase,
// so this presumably exercises synonym normalization — confirm against the
// test dataset loaded by `common::Server::test_server()`.
async fn search_with_settings_synonyms() { async fn search_with_settings_synonyms() {
let mut server = common::Server::test_server().await; let mut server = common::Server::test_server().await;
// Full settings payload sent to the server; only the "synonyms" entry is
// specific to this test.
let config = json!({
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"wordsPosition",
"desc(age)",
"exactness",
"desc(balance)"
],
"distinctAttribute": null,
"searchableAttributes": [
"name",
"age",
"color",
"gender",
"email",
"address",
"about"
],
"displayedAttributes": [
"name",
"age",
"gender",
"color",
"email",
"phone",
"address",
"balance"
],
"stopWords": null,
"synonyms": {
"Application": [
"Exercitation"
]
},
});
server.update_all_settings(config).await;
// Lowercase query, limited to three hits.
let query = "q=application&limit=3";
// Documents expected in the "hits" array (compared without regard to order).
let expect = json!([
{
"balance": "$1,921.58",
"age": 31,
"color": "Green",
"name": "Harper Carson",
"gender": "male",
"email": "harpercarson@chorizon.com",
"phone": "+1 (912) 430-3243",
"address": "883 Dennett Place, Knowlton, New Mexico, 9219"
},
{
"balance": "$1,706.13",
"age": 27,
"color": "Green",
"name": "Cherry Orr",
"gender": "female",
"email": "cherryorr@chorizon.com",
"phone": "+1 (995) 479-3174",
"address": "442 Beverly Road, Ventress, New Mexico, 3361"
},
{
"balance": "$1,476.39",
"age": 28,
"color": "brown",
"name": "Maureen Dale",
"gender": "female",
"email": "maureendale@chorizon.com",
"phone": "+1 (984) 538-3684",
"address": "817 Newton Street, Bannock, Wyoming, 1468"
}
]);
// The HTTP status code is ignored; only the "hits" payload is asserted.
let (response, _status_code) = server.search_get(query).await;
assert_json_eq!(expect, response["hits"].clone(), ordered: false);
}
#[actix_rt::test]
async fn search_with_settings_normalized_synonyms() {
let mut server = common::Server::test_server().await;
let config = json!({ let config = json!({
"rankingRules": [ "rankingRules": [
"typo", "typo",