3942: Normalize for the search the facets values r=ManyTheFish a=Kerollmops

This PR improves and fixes the search for facet values feature. Searching for _bre_ wasn't returning facet values like _brévent_ or _brô_.

The issue was related to the fact that facets are normalized but not in the same way as the `searchableAttributes` are. We decided to normalize them further and add another intermediate database where the key is the normalized facet value, and the value is a set of the non-normalized facets. We then use these non-normalized ones to get the correct counts by fetching the associated databases.

### What's missing in this PR?
 - [x] Apply the change to the whole set of `SearchForFacetValue::execute` conditions.
 - [x] Factorize the code that does an intermediate normalized value fetch in a function.
 - [x] Add or modify the search for facet value test.

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2023-07-25 14:37:17 +00:00 committed by GitHub
commit be72be7c0d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 279 additions and 94 deletions

View File

@ -1,3 +1,4 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use serde_json::{json, Value}; use serde_json::{json, Value};
@ -56,6 +57,54 @@ async fn simple_facet_search() {
assert_eq!(response["facetHits"].as_array().unwrap().len(), 1); assert_eq!(response["facetHits"].as_array().unwrap().len(), 1);
} }
#[actix_rt::test]
async fn advanced_facet_search() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.update_settings_filterable_attributes(json!(["genres"])).await;
index.update_settings_typo_tolerance(json!({ "enabled": false })).await;
index.add_documents(documents, None).await;
index.wait_task(2).await;
let (response, code) =
index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await;
snapshot!(code, @"200 OK");
snapshot!(response["facetHits"].as_array().unwrap().len(), @"0");
let (response, code) =
index.facet_search(json!({"facetName": "genres", "facetQuery": "àdventure"})).await;
snapshot!(code, @"200 OK");
snapshot!(response["facetHits"].as_array().unwrap().len(), @"1");
}
#[actix_rt::test]
async fn more_advanced_facet_search() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.update_settings_filterable_attributes(json!(["genres"])).await;
index.update_settings_typo_tolerance(json!({ "disableOnWords": ["adventre"] })).await;
index.add_documents(documents, None).await;
index.wait_task(2).await;
let (response, code) =
index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await;
snapshot!(code, @"200 OK");
snapshot!(response["facetHits"].as_array().unwrap().len(), @"0");
let (response, code) =
index.facet_search(json!({"facetName": "genres", "facetQuery": "adventure"})).await;
snapshot!(code, @"200 OK");
snapshot!(response["facetHits"].as_array().unwrap().len(), @"1");
}
#[actix_rt::test] #[actix_rt::test]
async fn non_filterable_facet_search_error() { async fn non_filterable_facet_search_error() {
let server = Server::new().await; let server = Server::new().await;

View File

@ -0,0 +1,27 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
pub struct BEU16StrCodec;
impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
type DItem = (u16, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n_bytes, str_bytes) = bytes.split_at(2);
let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
let s = str::from_utf8(str_bytes).ok()?;
Some((n, s))
}
}
impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
type EItem = (u16, &'a str);
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s.len() + 2);
bytes.extend_from_slice(&n.to_be_bytes());
bytes.extend_from_slice(s.as_bytes());
Some(Cow::Owned(bytes))
}
}

View File

@ -1,3 +1,4 @@
mod beu16_str_codec;
mod beu32_str_codec; mod beu32_str_codec;
mod byte_slice_ref; mod byte_slice_ref;
pub mod facet; pub mod facet;
@ -14,6 +15,7 @@ mod str_str_u8_codec;
pub use byte_slice_ref::ByteSliceRefCodec; pub use byte_slice_ref::ByteSliceRefCodec;
pub use str_ref::StrRefCodec; pub use str_ref::StrRefCodec;
pub use self::beu16_str_codec::BEU16StrCodec;
pub use self::beu32_str_codec::BEU32StrCodec; pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec; pub use self::fst_set_codec::FstSetCodec;

View File

@ -1,5 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeSet, HashMap, HashSet};
use std::fs::File; use std::fs::File;
use std::mem::size_of; use std::mem::size_of;
use std::path::Path; use std::path::Path;
@ -20,7 +20,9 @@ use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FieldIdCodec, OrderedF64Codec, FieldIdCodec, OrderedF64Codec,
}; };
use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; use crate::heed_codec::{
BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
};
use crate::readable_slices::ReadableSlices; use crate::readable_slices::ReadableSlices;
use crate::{ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -95,6 +97,7 @@ pub mod db_name {
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids"; pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = "facet-id-normalized-string-strings";
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst"; pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -156,6 +159,8 @@ pub struct Index {
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
/// Maps the facet field id and ranges of strings with the docids that corresponds to them. /// Maps the facet field id and ranges of strings with the docids that corresponds to them.
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>, pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
/// Maps the facet field id of the normalized-for-search string facets with their original versions.
pub facet_id_normalized_string_strings: Database<BEU16StrCodec, SerdeJson<BTreeSet<String>>>,
/// Maps the facet field id of the string facets with an FST containing all the facets values. /// Maps the facet field id of the string facets with an FST containing all the facets values.
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>, pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
@ -180,7 +185,7 @@ impl Index {
) -> Result<Index> { ) -> Result<Index> {
use db_name::*; use db_name::*;
options.max_dbs(24); options.max_dbs(25);
unsafe { options.flag(Flags::MdbAlwaysFreePages) }; unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?; let env = options.open(path)?;
@ -210,6 +215,8 @@ impl Index {
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?; let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids = let facet_id_string_docids =
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?; env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_normalized_string_strings =
env.create_database(&mut wtxn, Some(FACET_ID_NORMALIZED_STRING_STRINGS))?;
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?; let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
let facet_id_exists_docids = let facet_id_exists_docids =
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?; env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
@ -245,6 +252,7 @@ impl Index {
field_id_word_count_docids, field_id_word_count_docids,
facet_id_f64_docids, facet_id_f64_docids,
facet_id_string_docids, facet_id_string_docids,
facet_id_normalized_string_strings,
facet_id_string_fst, facet_id_string_fst,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,

View File

@ -51,9 +51,10 @@ pub use self::error::{
pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap; pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{ pub use self::heed_codec::{
BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
UncheckedU8StrStrCodec,
}; };
pub use self::index::Index; pub use self::index::Index;
pub use self::search::{ pub use self::search::{

View File

@ -1,5 +1,8 @@
use std::fmt; use std::fmt;
use std::ops::ControlFlow;
use charabia::normalizer::NormalizerOption;
use charabia::Normalize;
use fst::automaton::{Automaton, Str}; use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
@ -14,8 +17,8 @@ use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::{ use crate::{
execute_search, normalize_facet, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result,
Result, SearchContext, BEU16, SearchContext, BEU16,
}; };
// Building these factories is not free. // Building these factories is not free.
@ -301,29 +304,28 @@ impl<'a> SearchForFacetValues<'a> {
match self.query.as_ref() { match self.query.as_ref() {
Some(query) => { Some(query) => {
let query = normalize_facet(query); let options = NormalizerOption { lossy: true, ..Default::default() };
let query = query.as_str(); let query = query.normalize(&options);
let query = query.as_ref();
let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
let field_authorizes_typos = let field_authorizes_typos =
!self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid); !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);
if authorize_typos && field_authorizes_typos { if authorize_typos && field_authorizes_typos {
let mut results = vec![];
let exact_words_fst = self.search_query.index.exact_words(rtxn)?; let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
if exact_words_fst.map_or(false, |fst| fst.contains(query)) { if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: query }; let mut results = vec![];
if let Some(FacetGroupValue { bitmap, .. }) = if fst.contains(query) {
index.facet_id_string_docids.get(rtxn, &key)? self.fetch_original_facets_using_normalized(
{ fid,
let count = search_candidates.intersection_len(&bitmap); query,
if count != 0 { query,
let value = self &search_candidates,
.one_original_value_of(fid, query, bitmap.min().unwrap())? &mut results,
.unwrap_or_else(|| query.to_string()); )?;
results.push(FacetValueHit { value, count });
}
} }
Ok(results)
} else { } else {
let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
@ -338,60 +340,41 @@ impl<'a> SearchForFacetValues<'a> {
}; };
let mut stream = fst.search(automaton).into_stream(); let mut stream = fst.search(automaton).into_stream();
let mut length = 0; let mut results = vec![];
while let Some(facet_value) = stream.next() { while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?; let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value }; if self
let docids = match index.facet_id_string_docids.get(rtxn, &key)? { .fetch_original_facets_using_normalized(
Some(FacetGroupValue { bitmap, .. }) => bitmap, fid,
None => { value,
error!( query,
"the facet value is missing from the facet database: {key:?}" &search_candidates,
); &mut results,
continue; )?
} .is_break()
}; {
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, value, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break; break;
} }
} }
}
Ok(results) Ok(results)
}
} else { } else {
let automaton = Str::new(query).starts_with(); let automaton = Str::new(query).starts_with();
let mut stream = fst.search(automaton).into_stream(); let mut stream = fst.search(automaton).into_stream();
let mut results = vec![]; let mut results = vec![];
let mut length = 0;
while let Some(facet_value) = stream.next() { while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?; let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value }; if self
let docids = match index.facet_id_string_docids.get(rtxn, &key)? { .fetch_original_facets_using_normalized(
Some(FacetGroupValue { bitmap, .. }) => bitmap, fid,
None => { value,
error!( query,
"the facet value is missing from the facet database: {key:?}" &search_candidates,
); &mut results,
continue; )?
} .is_break()
}; {
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, value, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break; break;
} }
} }
@ -401,7 +384,6 @@ impl<'a> SearchForFacetValues<'a> {
} }
None => { None => {
let mut results = vec![]; let mut results = vec![];
let mut length = 0;
let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
@ -412,9 +394,8 @@ impl<'a> SearchForFacetValues<'a> {
.one_original_value_of(fid, left_bound, bitmap.min().unwrap())? .one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
.unwrap_or_else(|| left_bound.to_string()); .unwrap_or_else(|| left_bound.to_string());
results.push(FacetValueHit { value, count }); results.push(FacetValueHit { value, count });
length += 1;
} }
if length >= MAX_NUMBER_OF_FACETS { if results.len() >= MAX_NUMBER_OF_FACETS {
break; break;
} }
} }
@ -422,6 +403,50 @@ impl<'a> SearchForFacetValues<'a> {
} }
} }
} }
fn fetch_original_facets_using_normalized(
&self,
fid: FieldId,
value: &str,
query: &str,
search_candidates: &RoaringBitmap,
results: &mut Vec<FacetValueHit>,
) -> Result<ControlFlow<()>> {
let index = self.search_query.index;
let rtxn = self.search_query.rtxn;
let database = index.facet_id_normalized_string_strings;
let key = (fid, value);
let original_strings = match database.get(rtxn, &key)? {
Some(original_strings) => original_strings,
None => {
error!("the facet value is missing from the facet database: {key:?}");
return Ok(ControlFlow::Continue(()));
}
};
for original in original_strings {
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!("the facet value is missing from the facet database: {key:?}");
return Ok(ControlFlow::Continue(()));
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, &original, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
}
if results.len() >= MAX_NUMBER_OF_FACETS {
return Ok(ControlFlow::Break(()));
}
}
Ok(ControlFlow::Continue(()))
}
} }
#[derive(Debug, Clone, serde::Serialize, PartialEq)] #[derive(Debug, Clone, serde::Serialize, PartialEq)]

View File

@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids, script_language_docids,
facet_id_f64_docids, facet_id_f64_docids,
facet_id_string_docids, facet_id_string_docids,
facet_id_normalized_string_strings,
facet_id_string_fst, facet_id_string_fst,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
@ -92,6 +93,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_fid_docids.clear(self.wtxn)?; word_prefix_fid_docids.clear(self.wtxn)?;
script_language_docids.clear(self.wtxn)?; script_language_docids.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?;
facet_id_normalized_string_strings.clear(self.wtxn)?;
facet_id_string_fst.clear(self.wtxn)?; facet_id_string_fst.clear(self.wtxn)?;
facet_id_exists_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?;
facet_id_is_null_docids.clear(self.wtxn)?; facet_id_is_null_docids.clear(self.wtxn)?;

View File

@ -236,6 +236,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_fid_docids, word_prefix_fid_docids,
facet_id_f64_docids: _, facet_id_f64_docids: _,
facet_id_string_docids: _, facet_id_string_docids: _,
facet_id_normalized_string_strings: _,
facet_id_string_fst: _, facet_id_string_fst: _,
field_id_docid_facet_f64s: _, field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _, field_id_docid_facet_strings: _,

View File

@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_GROUP_SIZE: u8 = 4;
pub const FACET_MIN_LEVEL_SIZE: u8 = 5; pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File; use std::fs::File;
use std::iter::FromIterator;
use heed::types::DecodeIgnore; use charabia::normalizer::{Normalize, NormalizerOption};
use grenad::{CompressionType, SortAlgorithm};
use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
use heed::BytesEncode;
use log::debug; use log::debug;
use time::OffsetDateTime; use time::OffsetDateTime;
@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec; use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result, BEU16}; use crate::update::index_documents::create_sorter;
use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, BEU16};
pub mod bulk; pub mod bulk;
pub mod delete; pub mod delete;
@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
incremental_update.execute(wtxn)?; incremental_update.execute(wtxn)?;
} }
// We clear the list of normalized-for-search facets
// and the previous FSTs to compute everything from scratch
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
self.index.facet_id_string_fst.clear(wtxn)?;
// As we can't use the same write transaction to read and write in two different databases
// we must create a temporary sorter that we will write into LMDB afterward.
// As multiple unnormalized facet values can become the same normalized facet value
// we must merge them together.
let mut sorter = create_sorter(
SortAlgorithm::Unstable,
merge_btreeset_string,
CompressionType::None,
None,
None,
None,
);
// We iterate on the list of original, semi-normalized, facet values
// and normalize them for search, inserting them in LMDB in any given order.
let options = NormalizerOption { lossy: true, ..Default::default() };
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, ()) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
let normalized_facet = left_bound.normalize(&options);
let set = BTreeSet::from_iter(std::iter::once(left_bound));
let key = (field_id, normalized_facet.as_ref());
let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
sorter.insert(key, val)?;
}
}
// In this loop we don't need to take care of merging bitmaps
// as the grenad sorter already merged them for us.
let mut merger_iter = sorter.into_stream_merger_iter()?;
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
self.index
.facet_id_normalized_string_strings
.remap_types::<ByteSlice, ByteSlice>()
.put(wtxn, key_bytes, btreeset_bytes)?;
}
// We compute one FST by string facet // We compute one FST by string facet
let mut text_fsts = vec![]; let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None; let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>(); let database =
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? { for result in database.iter(wtxn)? {
let (facet_group_key, _) = result?; let ((field_id, normalized_facet), _) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key { current_fst = match current_fst.take() {
current_fst = match current_fst.take() { Some((fid, fst_builder)) if fid != field_id => {
Some((fid, fst_builder)) if fid != field_id => { let fst = fst_builder.into_set();
let fst = fst_builder.into_set(); text_fsts.push((fid, fst));
text_fsts.push((fid, fst)); Some((field_id, fst::SetBuilder::memory()))
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(left_bound)?;
} }
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(normalized_facet)?;
} }
} }
@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
text_fsts.push((field_id, fst)); text_fsts.push((field_id, fst));
} }
// We remove all of the previous FSTs that were in this database
self.index.facet_id_string_fst.clear(wtxn)?;
// We write those FSTs in LMDB now // We write those FSTs in LMDB now
for (field_id, fst) in text_fsts { for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?; self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;

View File

@ -1,4 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::BTreeSet;
use std::io; use std::io;
use std::result::Result as StdResult; use std::result::Result as StdResult;
@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
} }
} }
pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
// TODO improve the perf by using a `#[borrow] Cow<str>`.
let strings: BTreeSet<String> = values
.iter()
.map(AsRef::as_ref)
.map(serde_json::from_slice::<BTreeSet<String>>)
.map(StdResult::unwrap)
.reduce(|mut current, new| {
for x in new {
current.insert(x);
}
current
})
.unwrap();
Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
}
}
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
Ok(values[0].clone()) Ok(values[0].clone())
} }

View File

@ -13,9 +13,9 @@ pub use grenad_helpers::{
GrenadParameters, MergeableReader, GrenadParameters, MergeableReader,
}; };
pub use merge_functions::{ pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap, merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
MergeFn, serialize_roaring_bitmap, MergeFn,
}; };
use crate::MAX_WORD_LENGTH; use crate::MAX_WORD_LENGTH;

View File

@ -26,7 +26,7 @@ pub use self::enrich::{
}; };
pub use self::helpers::{ pub use self::helpers::{
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
}; };
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};

View File

@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::{ pub use self::index_documents::{
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn, DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
MergeFn,
}; };
pub use self::indexer_config::IndexerConfig; pub use self::indexer_config::IndexerConfig;
pub use self::prefix_word_pairs::{ pub use self::prefix_word_pairs::{