3621: Fix facet normalization r=Kerollmops a=ManyTheFish

# Pull Request

Make sure the facet normalization is the same between indexing and search.

## Related issue
Fixes #3599



Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
bors[bot] 2023-03-29 12:47:20 +00:00 committed by GitHub
commit 37a24a4a05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 43 additions and 41 deletions

View File

@ -14,7 +14,7 @@ async fn formatted_contain_wildcard() {
index.add_documents(documents, None).await; index.add_documents(documents, None).await;
index.wait_task(1).await; index.wait_task(1).await;
index.search(json!({ "q": "pesti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"], "showMatchesPosition": true }), index.search(json!({ "q": "pésti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"], "showMatchesPosition": true }),
|response, code| |response, code|
{ {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
@ -23,7 +23,7 @@ async fn formatted_contain_wildcard() {
json!({ json!({
"_formatted": { "_formatted": {
"id": "852", "id": "852",
"cattos": "<em>pesti</em>", "cattos": "<em>pésti</em>",
}, },
"_matchesPosition": {"cattos": [{"start": 0, "length": 5}]}, "_matchesPosition": {"cattos": [{"start": 0, "length": 5}]},
}) })
@ -33,13 +33,13 @@ async fn formatted_contain_wildcard() {
.await; .await;
index index
.search(json!({ "q": "pesti", "attributesToRetrieve": ["*"] }), |response, code| { .search(json!({ "q": "pésti", "attributesToRetrieve": ["*"] }), |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
response["hits"][0], response["hits"][0],
json!({ json!({
"id": 852, "id": 852,
"cattos": "pesti", "cattos": "pésti",
}) })
); );
}) })
@ -47,17 +47,17 @@ async fn formatted_contain_wildcard() {
index index
.search( .search(
json!({ "q": "pesti", "attributesToRetrieve": ["*"], "attributesToHighlight": ["id"], "showMatchesPosition": true }), json!({ "q": "pésti", "attributesToRetrieve": ["*"], "attributesToHighlight": ["id"], "showMatchesPosition": true }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
response["hits"][0], response["hits"][0],
json!({ json!({
"id": 852, "id": 852,
"cattos": "pesti", "cattos": "pésti",
"_formatted": { "_formatted": {
"id": "852", "id": "852",
"cattos": "pesti", "cattos": "pésti",
}, },
"_matchesPosition": {"cattos": [{"start": 0, "length": 5}]}, "_matchesPosition": {"cattos": [{"start": 0, "length": 5}]},
}) })
@ -68,17 +68,17 @@ async fn formatted_contain_wildcard() {
index index
.search( .search(
json!({ "q": "pesti", "attributesToRetrieve": ["*"], "attributesToCrop": ["*"] }), json!({ "q": "pésti", "attributesToRetrieve": ["*"], "attributesToCrop": ["*"] }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
response["hits"][0], response["hits"][0],
json!({ json!({
"id": 852, "id": 852,
"cattos": "pesti", "cattos": "pésti",
"_formatted": { "_formatted": {
"id": "852", "id": "852",
"cattos": "pesti", "cattos": "pésti",
} }
}) })
); );
@ -87,16 +87,16 @@ async fn formatted_contain_wildcard() {
.await; .await;
index index
.search(json!({ "q": "pesti", "attributesToCrop": ["*"] }), |response, code| { .search(json!({ "q": "pésti", "attributesToCrop": ["*"] }), |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
response["hits"][0], response["hits"][0],
json!({ json!({
"id": 852, "id": 852,
"cattos": "pesti", "cattos": "pésti",
"_formatted": { "_formatted": {
"id": "852", "id": "852",
"cattos": "pesti", "cattos": "pésti",
} }
}) })
); );
@ -114,7 +114,7 @@ async fn format_nested() {
index.wait_task(0).await; index.wait_task(0).await;
index index
.search(json!({ "q": "pesti", "attributesToRetrieve": ["doggos"] }), |response, code| { .search(json!({ "q": "pésti", "attributesToRetrieve": ["doggos"] }), |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
response["hits"][0], response["hits"][0],
@ -136,7 +136,7 @@ async fn format_nested() {
index index
.search( .search(
json!({ "q": "pesti", "attributesToRetrieve": ["doggos.name"] }), json!({ "q": "pésti", "attributesToRetrieve": ["doggos.name"] }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
@ -180,7 +180,7 @@ async fn format_nested() {
.await; .await;
index index
.search(json!({ "q": "pesti", "attributesToRetrieve": [], "attributesToHighlight": ["doggos.name"] }), .search(json!({ "q": "pésti", "attributesToRetrieve": [], "attributesToHighlight": ["doggos.name"] }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
@ -202,7 +202,7 @@ async fn format_nested() {
.await; .await;
index index
.search(json!({ "q": "pesti", "attributesToRetrieve": [], "attributesToCrop": ["doggos.name"] }), .search(json!({ "q": "pésti", "attributesToRetrieve": [], "attributesToCrop": ["doggos.name"] }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
@ -224,7 +224,7 @@ async fn format_nested() {
.await; .await;
index index
.search(json!({ "q": "pesti", "attributesToRetrieve": ["doggos.name"], "attributesToHighlight": ["doggos.age"] }), .search(json!({ "q": "pésti", "attributesToRetrieve": ["doggos.name"], "attributesToHighlight": ["doggos.age"] }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(
@ -256,7 +256,7 @@ async fn format_nested() {
.await; .await;
index index
.search(json!({ "q": "pesti", "attributesToRetrieve": [], "attributesToHighlight": ["doggos.age"], "attributesToCrop": ["doggos.name"] }), .search(json!({ "q": "pésti", "attributesToRetrieve": [], "attributesToHighlight": ["doggos.age"], "attributesToCrop": ["doggos.name"] }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!( assert_eq!(

View File

@ -30,7 +30,7 @@ pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
"id": "166428", "id": "166428",
}, },
{ {
"title": "Glass", "title": "Gläss",
"id": "450465", "id": "450465",
} }
]) ])
@ -52,7 +52,7 @@ pub(self) static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
"age": 4, "age": 4,
}, },
], ],
"cattos": "pesti", "cattos": "pésti",
}, },
{ {
"id": 654, "id": 654,
@ -142,7 +142,7 @@ async fn simple_search() {
index.wait_task(1).await; index.wait_task(1).await;
index index
.search(json!({"q": "pesti"}), |response, code| { .search(json!({"q": "pésti"}), |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
assert_eq!(response["hits"].as_array().unwrap().len(), 2); assert_eq!(response["hits"].as_array().unwrap().len(), 2);
}) })
@ -250,7 +250,7 @@ async fn search_multiple_params() {
index index
.search( .search(
json!({ json!({
"q": "pesti", "q": "pésti",
"attributesToCrop": ["catto:2"], "attributesToCrop": ["catto:2"],
"attributesToHighlight": ["catto"], "attributesToHighlight": ["catto"],
"limit": 2, "limit": 2,
@ -281,7 +281,7 @@ async fn search_with_filter_string_notation() {
index index
.search( .search(
json!({ json!({
"filter": "title = Glass" "filter": "title = Gläss"
}), }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
@ -305,7 +305,7 @@ async fn search_with_filter_string_notation() {
index index
.search( .search(
json!({ json!({
"filter": "cattos = pesti" "filter": "cattos = pésti"
}), }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
@ -343,7 +343,7 @@ async fn search_with_filter_array_notation() {
let (response, code) = index let (response, code) = index
.search_post(json!({ .search_post(json!({
"filter": ["title = Glass"] "filter": ["title = Gläss"]
})) }))
.await; .await;
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
@ -351,7 +351,7 @@ async fn search_with_filter_array_notation() {
let (response, code) = index let (response, code) = index
.search_post(json!({ .search_post(json!({
"filter": [["title = Glass", "title = \"Shazam!\"", "title = \"Escape Room\""]] "filter": [["title = Gläss", "title = \"Shazam!\"", "title = \"Escape Room\""]]
})) }))
.await; .await;
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);

View File

@ -71,7 +71,7 @@ async fn simple_search_single_index() {
"indexUid": "test", "indexUid": "test",
"hits": [ "hits": [
{ {
"title": "Glass", "title": "Gläss",
"id": "450465" "id": "450465"
} }
], ],
@ -166,7 +166,7 @@ async fn simple_search_two_indexes() {
let (response, code) = server let (response, code) = server
.multi_search(json!({"queries": [ .multi_search(json!({"queries": [
{"indexUid" : "test", "q": "glass"}, {"indexUid" : "test", "q": "glass"},
{"indexUid": "nested", "q": "pesti"}, {"indexUid": "nested", "q": "pésti"},
]})) ]}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
@ -176,7 +176,7 @@ async fn simple_search_two_indexes() {
"indexUid": "test", "indexUid": "test",
"hits": [ "hits": [
{ {
"title": "Glass", "title": "Gläss",
"id": "450465" "id": "450465"
} }
], ],
@ -203,7 +203,7 @@ async fn simple_search_two_indexes() {
"age": 4 "age": 4
} }
], ],
"cattos": "pesti" "cattos": "pésti"
}, },
{ {
"id": 654, "id": 654,
@ -221,7 +221,7 @@ async fn simple_search_two_indexes() {
] ]
} }
], ],
"query": "pesti", "query": "pésti",
"processingTimeMs": "[time]", "processingTimeMs": "[time]",
"limit": 20, "limit": 20,
"offset": 0, "offset": 0,
@ -243,7 +243,7 @@ async fn search_one_index_doesnt_exist() {
let (response, code) = server let (response, code) = server
.multi_search(json!({"queries": [ .multi_search(json!({"queries": [
{"indexUid" : "test", "q": "glass"}, {"indexUid" : "test", "q": "glass"},
{"indexUid": "nested", "q": "pesti"}, {"indexUid": "nested", "q": "pésti"},
]})) ]}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -264,7 +264,7 @@ async fn search_multiple_indexes_dont_exist() {
let (response, code) = server let (response, code) = server
.multi_search(json!({"queries": [ .multi_search(json!({"queries": [
{"indexUid" : "test", "q": "glass"}, {"indexUid" : "test", "q": "glass"},
{"indexUid": "nested", "q": "pesti"}, {"indexUid": "nested", "q": "pésti"},
]})) ]}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -296,7 +296,7 @@ async fn search_one_query_error() {
let (response, code) = server let (response, code) = server
.multi_search(json!({"queries": [ .multi_search(json!({"queries": [
{"indexUid" : "test", "q": "glass", "facets": ["title"]}, {"indexUid" : "test", "q": "glass", "facets": ["title"]},
{"indexUid": "nested", "q": "pesti"}, {"indexUid": "nested", "q": "pésti"},
]})) ]}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -328,7 +328,7 @@ async fn search_multiple_query_errors() {
let (response, code) = server let (response, code) = server
.multi_search(json!({"queries": [ .multi_search(json!({"queries": [
{"indexUid" : "test", "q": "glass", "facets": ["title"]}, {"indexUid" : "test", "q": "glass", "facets": ["title"]},
{"indexUid": "nested", "q": "pesti", "facets": ["doggos"]}, {"indexUid": "nested", "q": "pésti", "facets": ["doggos"]},
]})) ]}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");

View File

@ -22,6 +22,7 @@ use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto}; use std::convert::{TryFrom, TryInto};
use std::hash::BuildHasherDefault; use std::hash::BuildHasherDefault;
use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
pub use filter_parser::{Condition, FilterCondition, Span, Token}; pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64}; use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType; pub use grenad::CompressionType;
@ -252,6 +253,10 @@ pub fn is_faceted_by(field: &str, facet: &str) -> bool {
&& field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
} }
pub fn normalize_facet(original: &str) -> String {
CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use serde_json::json; use serde_json::json;

View File

@ -230,7 +230,7 @@ impl<'a> Filter<'a> {
&FacetGroupKey { &FacetGroupKey {
field_id, field_id,
level: 0, level: 0,
left_bound: &val.value().to_lowercase(), left_bound: &crate::normalize_facet(val.value()),
}, },
)? )?
.map(|v| v.bitmap) .map(|v| v.bitmap)

View File

@ -4,7 +4,6 @@ use std::fs::File;
use std::io; use std::io;
use std::mem::size_of; use std::mem::size_of;
use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
use heed::zerocopy::AsBytes; use heed::zerocopy::AsBytes;
use heed::BytesEncode; use heed::BytesEncode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -136,9 +135,7 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
} }
} }
Value::String(original) => { Value::String(original) => {
let normalized = CompatibilityDecompositionNormalizer let normalized = crate::normalize_facet(original);
.normalize_str(original.trim())
.to_lowercase();
output_strings.push((normalized, original.clone())); output_strings.push((normalized, original.clone()));
} }
Value::Array(values) => { Value::Array(values) => {