From efea1e5837571e7af4822b12b2a25af2530f0178 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 29 Mar 2023 10:57:02 +0200 Subject: [PATCH] Fix facet normalization --- milli/src/lib.rs | 5 +++++ milli/src/search/facet/filter.rs | 2 +- .../extract/extract_fid_docid_facet_values.rs | 5 +---- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 865195df5..e49e49d9c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,6 +22,7 @@ use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; +use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}; pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; @@ -252,6 +253,10 @@ pub fn is_faceted_by(field: &str, facet: &str) -> bool { && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) } +pub fn normalize_facet(original: &str) -> String { + CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase() +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index f67219494..c24abe6a5 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -230,7 +230,7 @@ impl<'a> Filter<'a> { &FacetGroupKey { field_id, level: 0, - left_bound: &val.value().to_lowercase(), + left_bound: &crate::normalize_facet(val.value()), }, )? 
.map(|v| v.bitmap) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 71ac330e2..f0bd78792 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -4,7 +4,6 @@ use std::fs::File; use std::io; use std::mem::size_of; -use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}; use heed::zerocopy::AsBytes; use heed::BytesEncode; use roaring::RoaringBitmap; @@ -136,9 +135,7 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { } } Value::String(original) => { - let normalized = CompatibilityDecompositionNormalizer - .normalize_str(original.trim()) - .to_lowercase(); + let normalized = crate::normalize_facet(original); output_strings.push((normalized, original.clone())); } Value::Array(values) => {