mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 17:11:15 +08:00
Merge #1184
1184: normalize synonyms during indexation r=MarinPostma a=LegendreM fix #1135 #964 Normalizes the synonyms before indexing them, so they are not case-sensitive anymore. The normalization also involves deunicoding in some cases, such as accents, so `été` and `ete` are considered equivalent in a search for synonyms. Co-authored-by: many <maxime@meilisearch.com> Co-authored-by: Many <legendre.maxime.isn@gmail.com>
This commit is contained in:
commit
81e9fd8933
@ -1,9 +1,10 @@
|
|||||||
use std::collections::{BTreeMap, BTreeSet};
|
use std::{borrow::Cow, collections::{BTreeMap, BTreeSet}};
|
||||||
|
|
||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
use fst::{set::OpBuilder, SetBuilder};
|
use fst::{set::OpBuilder, SetBuilder};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use meilisearch_schema::Schema;
|
use meilisearch_schema::Schema;
|
||||||
|
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||||
|
|
||||||
use crate::database::{MainT, UpdateT};
|
use crate::database::{MainT, UpdateT};
|
||||||
use crate::settings::{UpdateState, SettingsUpdate, RankingRule};
|
use crate::settings::{UpdateState, SettingsUpdate, RankingRule};
|
||||||
@ -289,13 +290,24 @@ pub fn apply_synonyms_update(
|
|||||||
|
|
||||||
let main_store = index.main;
|
let main_store = index.main;
|
||||||
let synonyms_store = index.synonyms;
|
let synonyms_store = index.synonyms;
|
||||||
|
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
|
||||||
|
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||||
|
|
||||||
|
// Runs `text` through `analyzer` and returns a new `String` built by appending the
// text of every token the analysis yields, in order, with no separator between tokens.
// NOTE(review): per the commit message this is what makes synonyms case-insensitive and
// deunicoded; that behaviour comes from the analyzer itself — confirm against its docs.
fn normalize<T: AsRef<[u8]>>(analyzer: &Analyzer<T>, text: &str) -> String {
|
||||||
|
analyzer.analyze(&text)
|
||||||
|
.tokens()
|
||||||
|
.fold(String::new(), |s, t| s + t.text())
|
||||||
|
}
|
||||||
|
|
||||||
let mut synonyms_builder = SetBuilder::memory();
|
let mut synonyms_builder = SetBuilder::memory();
|
||||||
synonyms_store.clear(writer)?;
|
synonyms_store.clear(writer)?;
|
||||||
for (word, alternatives) in synonyms.clone() {
|
for (word, alternatives) in synonyms {
|
||||||
|
let word = normalize(&analyzer, &word);
|
||||||
|
|
||||||
synonyms_builder.insert(&word)?;
|
synonyms_builder.insert(&word)?;
|
||||||
|
|
||||||
let alternatives = {
|
let alternatives = {
|
||||||
|
let alternatives = alternatives.iter().map(|text| normalize(&analyzer, &text)).collect();
|
||||||
let alternatives = SetBuf::from_dirty(alternatives);
|
let alternatives = SetBuf::from_dirty(alternatives);
|
||||||
let mut alternatives_builder = SetBuilder::memory();
|
let mut alternatives_builder = SetBuilder::memory();
|
||||||
alternatives_builder.extend_iter(alternatives)?;
|
alternatives_builder.extend_iter(alternatives)?;
|
||||||
|
@ -167,6 +167,89 @@ async fn search_with_settings_stop_words() {
|
|||||||
async fn search_with_settings_synonyms() {
|
async fn search_with_settings_synonyms() {
|
||||||
let mut server = common::Server::test_server().await;
|
let mut server = common::Server::test_server().await;
|
||||||
|
|
||||||
|
let config = json!({
|
||||||
|
"rankingRules": [
|
||||||
|
"typo",
|
||||||
|
"words",
|
||||||
|
"proximity",
|
||||||
|
"attribute",
|
||||||
|
"wordsPosition",
|
||||||
|
"desc(age)",
|
||||||
|
"exactness",
|
||||||
|
"desc(balance)"
|
||||||
|
],
|
||||||
|
"distinctAttribute": null,
|
||||||
|
"searchableAttributes": [
|
||||||
|
"name",
|
||||||
|
"age",
|
||||||
|
"color",
|
||||||
|
"gender",
|
||||||
|
"email",
|
||||||
|
"address",
|
||||||
|
"about"
|
||||||
|
],
|
||||||
|
"displayedAttributes": [
|
||||||
|
"name",
|
||||||
|
"age",
|
||||||
|
"gender",
|
||||||
|
"color",
|
||||||
|
"email",
|
||||||
|
"phone",
|
||||||
|
"address",
|
||||||
|
"balance"
|
||||||
|
],
|
||||||
|
"stopWords": null,
|
||||||
|
"synonyms": {
|
||||||
|
"Application": [
|
||||||
|
"Exercitation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
server.update_all_settings(config).await;
|
||||||
|
|
||||||
|
let query = "q=application&limit=3";
|
||||||
|
let expect = json!([
|
||||||
|
{
|
||||||
|
"balance": "$1,921.58",
|
||||||
|
"age": 31,
|
||||||
|
"color": "Green",
|
||||||
|
"name": "Harper Carson",
|
||||||
|
"gender": "male",
|
||||||
|
"email": "harpercarson@chorizon.com",
|
||||||
|
"phone": "+1 (912) 430-3243",
|
||||||
|
"address": "883 Dennett Place, Knowlton, New Mexico, 9219"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"balance": "$1,706.13",
|
||||||
|
"age": 27,
|
||||||
|
"color": "Green",
|
||||||
|
"name": "Cherry Orr",
|
||||||
|
"gender": "female",
|
||||||
|
"email": "cherryorr@chorizon.com",
|
||||||
|
"phone": "+1 (995) 479-3174",
|
||||||
|
"address": "442 Beverly Road, Ventress, New Mexico, 3361"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"balance": "$1,476.39",
|
||||||
|
"age": 28,
|
||||||
|
"color": "brown",
|
||||||
|
"name": "Maureen Dale",
|
||||||
|
"gender": "female",
|
||||||
|
"email": "maureendale@chorizon.com",
|
||||||
|
"phone": "+1 (984) 538-3684",
|
||||||
|
"address": "817 Newton Street, Bannock, Wyoming, 1468"
|
||||||
|
}
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (response, _status_code) = server.search_get(query).await;
|
||||||
|
assert_json_eq!(expect, response["hits"].clone(), ordered: false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn search_with_settings_normalized_synonyms() {
|
||||||
|
let mut server = common::Server::test_server().await;
|
||||||
|
|
||||||
let config = json!({
|
let config = json!({
|
||||||
"rankingRules": [
|
"rankingRules": [
|
||||||
"typo",
|
"typo",
|
||||||
|
Loading…
Reference in New Issue
Block a user