5258: Unify facet strings by their normalized value r=ManyTheFish a=dureuill

Fixes #5228: the "missing facet keys" issue.

- Before this PR, updating a document such that `"facet": "DUREUILL"` would become `"facet": "dureuill"` could cause the normalized facet value `dureuill` to be removed from `field_id_docid_facet_strings` db.
- This PR makes sure to unify the intermediate representation of the facet strings by their field_id and **normalized** (and truncated) string value.
- The introduced test is testing only one of the two facet distribution algorithms.
- We removed the panic when the facet string was not found, and we instead returned the normalized string.

## Draft status

- [x] target release v1.12.6 branch and milestone
- [ ] ~consider meilitool offline upgrade to fix the corrupted dbs in the wild.~
   workaround: ~remove facets, then add them again... if your facet distribution is right.~ Just use a dump.
- [x] Add unit test demonstrating the issue fixed by this PR.

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2025-01-21 11:02:33 +00:00 committed by GitHub
commit c575d2693b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 117 additions and 42 deletions

View File

@ -1746,3 +1746,57 @@ async fn change_attributes_settings() {
) )
.await; .await;
} }
/// Modifying facets with different casing should work correctly
#[actix_rt::test]
async fn change_facet_casing() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index
.update_settings(json!({
"filterableAttributes": ["dog"],
}))
.await;
assert_eq!("202", code.as_str(), "{:?}", response);
index.wait_task(response.uid()).await;
let (response, _code) = index
.add_documents(
json!([
{
"id": 1,
"dog": "Bouvier Bernois"
}
]),
None,
)
.await;
index.wait_task(response.uid()).await;
let (response, _code) = index
.add_documents(
json!([
{
"id": 1,
"dog": "bouvier bernois"
}
]),
None,
)
.await;
index.wait_task(response.uid()).await;
index
.search(json!({ "facets": ["dog"] }), |response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["facetDistribution"]), @r###"
{
"dog": {
"bouvier bernois": 1
}
}
"###);
})
.await;
}

View File

@ -219,12 +219,19 @@ impl<'a> FacetDistribution<'a> {
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
let original_string = self let optional_original_string =
.index self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?;
.field_id_docid_facet_strings
.get(self.rtxn, &key)? let original_string = match optional_original_string {
.unwrap() Some(original_string) => original_string.to_owned(),
.to_owned(); None => {
tracing::error!(
"Missing original facet string. Using the normalized facet {} instead",
facet_key
);
facet_key.to_string()
}
};
distribution.insert(original_string, nbr_docids); distribution.insert(original_string, nbr_docids);
if distribution.len() == self.max_values_per_facet { if distribution.len() == self.max_values_per_facet {

View File

@ -283,42 +283,60 @@ impl FacetedDocidsExtractor {
} }
struct DelAddFacetValue<'doc> { struct DelAddFacetValue<'doc> {
strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, strings: HashMap<
(FieldId, &'doc str),
Option<BVec<'doc, u8>>,
hashbrown::DefaultHashBuilder,
&'doc Bump,
>,
f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>,
doc_alloc: &'doc Bump,
} }
impl<'doc> DelAddFacetValue<'doc> { impl<'doc> DelAddFacetValue<'doc> {
fn new(doc_alloc: &'doc Bump) -> Self { fn new(doc_alloc: &'doc Bump) -> Self {
Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) } Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc), doc_alloc }
} }
fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind { match kind {
FacetKind::String => &mut self.strings, FacetKind::Number => {
FacetKind::Number => &mut self.f64s, let key = (fid, value);
_ => return, if let Some(DelAdd::Deletion) = self.f64s.get(&key) {
}; self.f64s.remove(&key);
} else {
let key = (fid, value); self.f64s.insert(key, DelAdd::Addition);
if let Some(DelAdd::Deletion) = cache.get(&key) { }
cache.remove(&key); }
} else { FacetKind::String => {
cache.insert(key, DelAdd::Addition); if let Ok(s) = std::str::from_utf8(&value) {
let normalized = crate::normalize_facet(s);
let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized));
self.strings.insert((fid, truncated), Some(value));
}
}
_ => (),
} }
} }
fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind { match kind {
FacetKind::String => &mut self.strings, FacetKind::Number => {
FacetKind::Number => &mut self.f64s, let key = (fid, value);
_ => return, if let Some(DelAdd::Addition) = self.f64s.get(&key) {
}; self.f64s.remove(&key);
} else {
let key = (fid, value); self.f64s.insert(key, DelAdd::Deletion);
if let Some(DelAdd::Addition) = cache.get(&key) { }
cache.remove(&key); }
} else { FacetKind::String => {
cache.insert(key, DelAdd::Deletion); if let Ok(s) = std::str::from_utf8(&value) {
let normalized = crate::normalize_facet(s);
let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized));
self.strings.insert((fid, truncated), None);
}
}
_ => (),
} }
} }
@ -329,18 +347,14 @@ impl<'doc> DelAddFacetValue<'doc> {
doc_alloc: &Bump, doc_alloc: &Bump,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
for ((fid, value), deladd) in self.strings { for ((fid, truncated), value) in self.strings {
if let Ok(s) = std::str::from_utf8(&value) { buffer.clear();
buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes());
buffer.extend_from_slice(&fid.to_be_bytes()); buffer.extend_from_slice(&docid.to_be_bytes());
buffer.extend_from_slice(&docid.to_be_bytes()); buffer.extend_from_slice(truncated.as_bytes());
let normalized = crate::normalize_facet(s); match &value {
let truncated = truncate_str(&normalized); Some(value) => sender.write_facet_string(&buffer, value)?,
buffer.extend_from_slice(truncated.as_bytes()); None => sender.delete_facet_string(&buffer)?,
match deladd {
DelAdd::Deletion => sender.delete_facet_string(&buffer)?,
DelAdd::Addition => sender.write_facet_string(&buffer, &value)?,
}
} }
} }