From d3a7e10348e8e381a454e6f7a9e564dfe00ddb67 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 21 Jan 2025 00:11:50 +0100 Subject: [PATCH 1/3] Unify facet strings by their normalized value --- .../new/extract/faceted/extract_facets.rs | 86 +++++++++++-------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 66ed6cbfb..1a4fcd9f6 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -283,42 +283,60 @@ impl FacetedDocidsExtractor { } struct DelAddFacetValue<'doc> { - strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + strings: HashMap< + (FieldId, &'doc str), + Option>, + hashbrown::DefaultHashBuilder, + &'doc Bump, + >, f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + doc_alloc: &'doc Bump, } impl<'doc> DelAddFacetValue<'doc> { fn new(doc_alloc: &'doc Bump) -> Self { - Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) } + Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc), doc_alloc } } fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { - let cache = match kind { - FacetKind::String => &mut self.strings, - FacetKind::Number => &mut self.f64s, - _ => return, - }; - - let key = (fid, value); - if let Some(DelAdd::Deletion) = cache.get(&key) { - cache.remove(&key); - } else { - cache.insert(key, DelAdd::Addition); + match kind { + FacetKind::Number => { + let key = (fid, value); + if let Some(DelAdd::Deletion) = self.f64s.get(&key) { + self.f64s.remove(&key); + } else { + self.f64s.insert(key, DelAdd::Addition); + } + } + FacetKind::String => { + if let Ok(s) = std::str::from_utf8(&value) { + let normalized = crate::normalize_facet(s); + let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized)); + self.strings.insert((fid, truncated), Some(value)); + } + } + _ => (), } } fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { - let cache = match kind { - FacetKind::String => &mut self.strings, - FacetKind::Number => &mut self.f64s, - _ => return, - }; - - let key = (fid, value); - if let Some(DelAdd::Addition) = cache.get(&key) { - cache.remove(&key); - } else { - cache.insert(key, DelAdd::Deletion); + match kind { + FacetKind::Number => { + let key = (fid, value); + if let Some(DelAdd::Addition) = self.f64s.get(&key) { + self.f64s.remove(&key); + } else { + self.f64s.insert(key, DelAdd::Deletion); + } + } + FacetKind::String => { + if let Ok(s) = std::str::from_utf8(&value) { + let normalized = crate::normalize_facet(s); + let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized)); + self.strings.insert((fid, truncated), None); + } + } + _ => (), } } @@ -329,18 +347,14 @@ impl<'doc> DelAddFacetValue<'doc> { doc_alloc: &Bump, ) -> crate::Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); - for ((fid, value), deladd) in self.strings { - if let Ok(s) = std::str::from_utf8(&value) { - buffer.clear(); - buffer.extend_from_slice(&fid.to_be_bytes()); - buffer.extend_from_slice(&docid.to_be_bytes()); - let normalized = crate::normalize_facet(s); - let truncated = truncate_str(&normalized); - buffer.extend_from_slice(truncated.as_bytes()); - match deladd { - DelAdd::Deletion => sender.delete_facet_string(&buffer)?, - DelAdd::Addition => sender.write_facet_string(&buffer, &value)?, - } + for ((fid, truncated), value) in self.strings { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.extend_from_slice(&docid.to_be_bytes()); + buffer.extend_from_slice(truncated.as_bytes()); + match &value { + Some(value) => sender.write_facet_string(&buffer, value)?, + None => sender.delete_facet_string(&buffer)?, } } From 145fa3a8ff105639bb0733e8e2a0ab9e6a0e562f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jan 2025 10:41:43 +0100 Subject: [PATCH 2/3] Add a test to check the facet casing is good --- crates/meilisearch/tests/search/mod.rs | 54 ++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 057b2b3a2..ae1bef7c4 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -1746,3 +1746,57 @@ async fn change_attributes_settings() { ) .await; } + +/// Modifying facets with different casing should work correctly +#[actix_rt::test] +async fn change_facet_casing() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "filterableAttributes": ["dog"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index + .add_documents( + json!([ + { + "id": 1, + "dog": "Bouvier Bernois" + } + ]), + None, + ) + .await; + index.wait_task(response.uid()).await; + + let (response, _code) = index + .add_documents( + json!([ + { + "id": 1, + "dog": "bouvier bernois" + } + ]), + None, + ) + .await; + index.wait_task(response.uid()).await; + + index + .search(json!({ "facets": ["dog"] }), |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["facetDistribution"]), @r###" + { + "dog": { + "bouvier bernois": 1 + } + } + "###); + }) + .await; +} From 024e06f7e3ff12e91b79bb3a0ad28fa779795382 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jan 2025 11:44:03 +0100 Subject: [PATCH 3/3] Do not panic when the facet string is not found --- .../src/search/facet/facet_distribution.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/search/facet/facet_distribution.rs b/crates/milli/src/search/facet/facet_distribution.rs index 027f8b176..ee0fad535 100644 --- a/crates/milli/src/search/facet/facet_distribution.rs +++ b/crates/milli/src/search/facet/facet_distribution.rs @@ -219,12 +219,19 @@ impl<'a> FacetDistribution<'a> { let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); - let original_string = self - .index - .field_id_docid_facet_strings - .get(self.rtxn, &key)? - .unwrap() - .to_owned(); + let optional_original_string = + self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?; + + let original_string = match optional_original_string { + Some(original_string) => original_string.to_owned(), + None => { + tracing::error!( + "Missing original facet string. Using the normalized facet {} instead", + facet_key + ); + facet_key.to_string() + } + }; distribution.insert(original_string, nbr_docids); if distribution.len() == self.max_values_per_facet {