Unify facet strings by their normalized value

This commit is contained in:
Louis Dureuil 2025-01-21 00:11:50 +01:00
parent 1c78447226
commit d3a7e10348
No known key found for this signature in database

View File

@ -283,42 +283,60 @@ impl FacetedDocidsExtractor {
} }
// NOTE(review): this text looks like a side-by-side diff rendering; each line appears to hold
// the pre-change code followed by the post-change code. Comments describe the post-change side
// unless stated otherwise.
/// Accumulates facet values to add or delete, with both maps allocating from a `&'doc Bump`.
struct DelAddFacetValue<'doc> { struct DelAddFacetValue<'doc> {
strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, strings: HashMap<
// Key: the field id plus the normalized, truncated facet string produced in
// `insert_add` / `insert_del`.
(FieldId, &'doc str),
// Value: `Some(value bytes)` is stored by `insert_add`, `None` by `insert_del`.
Option<BVec<'doc, u8>>,
hashbrown::DefaultHashBuilder,
&'doc Bump,
>,
// Number facets stay keyed by `(fid, value bytes)` and store an explicit `DelAdd` marker.
f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>,
// Bump allocator used to allocate the normalized string keys of `strings`.
doc_alloc: &'doc Bump,
} }
impl<'doc> DelAddFacetValue<'doc> { impl<'doc> DelAddFacetValue<'doc> {
/// Creates an empty accumulator whose two maps allocate from `doc_alloc`.
///
/// The post-change version also stores `doc_alloc` itself so that `insert_add` / `insert_del`
/// can allocate normalized string keys from it.
fn new(doc_alloc: &'doc Bump) -> Self { fn new(doc_alloc: &'doc Bump) -> Self {
Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) } Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc), doc_alloc }
} }
/// Records that `value` is added for field `fid`.
///
/// Post-change behavior, per `kind`:
/// - `FacetKind::Number`: keyed by `(fid, value)`. If a `DelAdd::Deletion` is already stored for
///   that key the entry is removed (the two cancel out); otherwise `DelAdd::Addition` is stored.
/// - `FacetKind::String`: `value` is decoded as UTF-8 (values that fail to decode are ignored),
///   normalized, truncated, and stored as `Some(value)` under the normalized key.
/// - any other kind: nothing is recorded.
///
/// Pre-change behavior: strings and numbers shared the cancel-out logic, both keyed by the
/// `value` bytes as passed in.
fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind { match kind {
FacetKind::String => &mut self.strings, FacetKind::Number => {
FacetKind::Number => &mut self.f64s, let key = (fid, value);
_ => return, if let Some(DelAdd::Deletion) = self.f64s.get(&key) {
}; self.f64s.remove(&key);
} else {
let key = (fid, value); self.f64s.insert(key, DelAdd::Addition);
if let Some(DelAdd::Deletion) = cache.get(&key) { }
cache.remove(&key); }
} else { FacetKind::String => {
cache.insert(key, DelAdd::Addition); if let Ok(s) = std::str::from_utf8(&value) {
// The map key is the normalized form, so values that normalize to the same string
// share a single entry.
let normalized = crate::normalize_facet(s);
// The truncated key is allocated in `doc_alloc` to obtain a `&'doc str`.
let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized));
// `insert` replaces whatever was stored under this key, including a `None` written
// by `insert_del`.
self.strings.insert((fid, truncated), Some(value));
}
}
_ => (),
} }
} }
/// Records that `value` is deleted for field `fid`.
///
/// Post-change behavior, per `kind`:
/// - `FacetKind::Number`: keyed by `(fid, value)`. If a `DelAdd::Addition` is already stored for
///   that key the entry is removed (the two cancel out); otherwise `DelAdd::Deletion` is stored.
/// - `FacetKind::String`: `value` is decoded as UTF-8 (values that fail to decode are ignored),
///   normalized, truncated, and stored as `None` under the normalized key.
/// - any other kind: nothing is recorded.
///
/// Pre-change behavior: strings and numbers shared the cancel-out logic, both keyed by the
/// `value` bytes as passed in.
fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind { match kind {
FacetKind::String => &mut self.strings, FacetKind::Number => {
FacetKind::Number => &mut self.f64s, let key = (fid, value);
_ => return, if let Some(DelAdd::Addition) = self.f64s.get(&key) {
}; self.f64s.remove(&key);
} else {
let key = (fid, value); self.f64s.insert(key, DelAdd::Deletion);
if let Some(DelAdd::Addition) = cache.get(&key) { }
cache.remove(&key); }
} else { FacetKind::String => {
cache.insert(key, DelAdd::Deletion); if let Ok(s) = std::str::from_utf8(&value) {
// Same key derivation as in `insert_add`: normalize, truncate, allocate in `doc_alloc`.
let normalized = crate::normalize_facet(s);
let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized));
// `insert` replaces whatever was stored under this key, including a `Some(..)` written
// by an earlier `insert_add`.
// NOTE(review): the result therefore depends on call order — confirm callers record
// deletions before additions.
self.strings.insert((fid, truncated), None);
}
}
_ => (),
} }
} }
@ -329,18 +347,14 @@ impl<'doc> DelAddFacetValue<'doc> {
doc_alloc: &Bump, doc_alloc: &Bump,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
for ((fid, value), deladd) in self.strings { for ((fid, truncated), value) in self.strings {
if let Ok(s) = std::str::from_utf8(&value) { buffer.clear();
buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes());
buffer.extend_from_slice(&fid.to_be_bytes()); buffer.extend_from_slice(&docid.to_be_bytes());
buffer.extend_from_slice(&docid.to_be_bytes()); buffer.extend_from_slice(truncated.as_bytes());
let normalized = crate::normalize_facet(s); match &value {
let truncated = truncate_str(&normalized); Some(value) => sender.write_facet_string(&buffer, value)?,
buffer.extend_from_slice(truncated.as_bytes()); None => sender.delete_facet_string(&buffer)?,
match deladd {
DelAdd::Deletion => sender.delete_facet_string(&buffer)?,
DelAdd::Addition => sender.write_facet_string(&buffer, &value)?,
}
} }
} }