From 8bf89ec394e60b35e75637a73ce0be718789f0c7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 21 Aug 2024 10:47:40 +0200 Subject: [PATCH 1/2] Infer locales from index settings --- meilisearch/tests/search/locales.rs | 331 +++++++++++++++++++++++- milli/src/localized_attributes_rules.rs | 15 ++ milli/src/search/new/mod.rs | 42 ++- 3 files changed, 381 insertions(+), 7 deletions(-) diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index 9f1c22b75..dbc4fcc30 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -386,12 +386,39 @@ async fn force_locales() { |response, code| { snapshot!(response, @r###" { - "hits": [], + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], "query": "\"进击的巨人\"", "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 0 + "estimatedTotalHits": 1 } "###); snapshot!(code, @"200 OK"); @@ -483,12 +510,39 @@ async fn force_locales_with_pattern() { |response, code| { snapshot!(response, @r###" { - "hits": [], + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], "query": "\"进击的巨人\"", "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 0 + "estimatedTotalHits": 1 } "###); snapshot!(code, @"200 OK"); @@ -761,6 +815,275 @@ async fn force_different_locales_with_pattern() { .await; } +#[actix_rt::test] +async fn auto_infer_locales_at_search_with_attributes_to_search_on() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_zh"], "locales": ["jpn"]}, + // force chinese + {"attributePatterns": ["*_ja"], "locales": ["cmn"]}, + // any language + {"attributePatterns": ["*_en"], "locales": []} + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // auto infer any language + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // should infer chinese + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"], "attributesToSearchOn": ["name_zh", "description_zh"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn auto_infer_locales_at_search() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*"], "locales": ["jpn"]}, + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + #[actix_rt::test] async fn force_different_locales_with_pattern_nested() { let server = Server::new().await; diff --git a/milli/src/localized_attributes_rules.rs b/milli/src/localized_attributes_rules.rs index 739d03043..3c421ca6b 100644 --- a/milli/src/localized_attributes_rules.rs +++ b/milli/src/localized_attributes_rules.rs @@ -90,6 +90,21 @@ impl LocalizedFieldIds { pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> { self.field_id_to_locales.get(&fields_id).map(Vec::as_slice) } + + pub fn all_locales(&self) -> Vec { + let mut locales = Vec::new(); + for field_locales in self.field_id_to_locales.values() { + if !field_locales.is_empty() { + locales.extend(field_locales); + } else { + // If a field has no locales, we consider it as not localized + return Vec::new(); + } + } + locales.sort(); + locales.dedup(); + locales + } } #[cfg(test)] diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 577e12a39..b30306a0b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; +use crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::Embedder; @@ -671,9 +672,44 @@ pub fn execute_search( tokbuilder.words_dict(dictionary); } - if let Some(locales) = locales { - tokbuilder.allow_list(locales); - } + let db_locales; + match locales { + Some(locales) => { + if !locales.is_empty() { + tokbuilder.allow_list(locales); + } + } + None => { + // If no locales are specified, we use the locales specified in the localized attributes rules + let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?; + let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?; + let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?; + + let localized_fields = match &ctx.restricted_fids { + // if AttributeToSearchOn is set, use the restricted list of ids + Some(restricted_fids) => { + let iter = restricted_fids + .exact + .iter() + .chain(restricted_fids.tolerant.iter()) + .map(|(fid, _)| *fid); + + LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter) + } + // Otherwise use the full list of ids coming from the index searchable fields + None => LocalizedFieldIds::new( + &localized_attributes_rules, + &fields_ids_map, + searchable_fields.into_iter(), + ), + }; + + db_locales = localized_fields.all_locales(); + if !db_locales.is_empty() { + tokbuilder.allow_list(&db_locales); + } + } + }; let tokenizer = tokbuilder.build(); drop(entered); From b12e997c8ae79f0e98c1bc2ccb942a4de199c95a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 21 Aug 2024 13:22:56 +0200 Subject: [PATCH 2/2] Add pinyin flag --- meilisearch/tests/search/mod.rs | 2 ++ milli/src/search/mod.rs | 1 + milli/src/search/new/tests/mod.rs | 1 + 3 files changed, 4 insertions(+) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 301ef9aa2..a05b27947 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -7,6 +7,7 @@ mod facet_search; mod formatted; mod geo; mod hybrid; +#[cfg(not(feature = "chinese-pinyin"))] mod locales; mod matching_strategy; mod multi; @@ -392,6 +393,7 @@ async fn negative_special_cases_search() { } #[cfg(feature = "default")] +#[cfg(not(feature = "chinese-pinyin"))] #[actix_rt::test] async fn test_kanji_language_detection() { let server = Server::new().await; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0f5eb23e1..3057066d2 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -360,6 +360,7 @@ mod test { use super::*; #[cfg(feature = "japanese")] + #[cfg(not(feature = "chinese-pinyin"))] #[test] fn test_kanji_language_detection() { use crate::index::tests::TempIndex; diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 0faff9425..37bca7597 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -6,6 +6,7 @@ pub mod exactness; pub mod geo_sort; pub mod integration; #[cfg(feature = "all-tokenizations")] +#[cfg(not(feature = "chinese-pinyin"))] pub mod language; pub mod ngram_split_words; pub mod proximity;