diff --git a/milli/src/heed_codec/beu16_str_codec.rs b/milli/src/heed_codec/beu16_str_codec.rs
new file mode 100644
index 000000000..d1b85d47f
--- /dev/null
+++ b/milli/src/heed_codec/beu16_str_codec.rs
@@ -0,0 +1,27 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::str;
+
+pub struct BEU16StrCodec;
+
+impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
+    type DItem = (u16, &'a str);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (n_bytes, str_bytes) = bytes.split_at(2);
+        let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
+        let s = str::from_utf8(str_bytes).ok()?;
+        Some((n, s))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
+    type EItem = (u16, &'a str);
+
+    fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut bytes = Vec::with_capacity(s.len() + 2);
+        bytes.extend_from_slice(&n.to_be_bytes());
+        bytes.extend_from_slice(s.as_bytes());
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index 666f68e28..d04eaa644 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -1,3 +1,4 @@
+mod beu16_str_codec;
 mod beu32_str_codec;
 mod byte_slice_ref;
 pub mod facet;
@@ -14,6 +15,7 @@ mod str_str_u8_codec;
 pub use byte_slice_ref::ByteSliceRefCodec;
 pub use str_ref::StrRefCodec;

+pub use self::beu16_str_codec::BEU16StrCodec;
 pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
 pub use self::fst_set_codec::FstSetCodec;
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 392ed1705..544047293 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@@ -21,7 +21,9 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
+use crate::heed_codec::{
+    BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
+};
 use crate::readable_slices::ReadableSlices;
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@@ -96,6 +98,7 @@ pub mod db_name {
     pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
     pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
     pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
+    pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = "facet-id-normalized-string-strings";
     pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@@ -157,6 +160,8 @@ pub struct Index {
     pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
     /// Maps the facet field id and ranges of strings with the docids that corresponds to them.
     pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
+    /// Maps the facet field id of the normalized-for-search string facets with their original versions.
+    pub facet_id_normalized_string_strings: Database<BEU16StrCodec, SerdeJson<BTreeSet<String>>>,
     /// Maps the facet field id of the string facets with an FST containing all the facets values.
     pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,

@@ -181,7 +186,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;

-        options.max_dbs(24);
+        options.max_dbs(25);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };

         let env = options.open(path)?;
@@ -211,6 +216,8 @@ impl Index {
         let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
         let facet_id_string_docids =
             env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
+        let facet_id_normalized_string_strings =
+            env.create_database(&mut wtxn, Some(FACET_ID_NORMALIZED_STRING_STRINGS))?;
         let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
         let facet_id_exists_docids =
             env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
@@ -246,6 +253,7 @@ impl Index {
             field_id_word_count_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
+            facet_id_normalized_string_strings,
             facet_id_string_fst,
             facet_id_exists_docids,
             facet_id_is_null_docids,
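Note: the key layout behind BEU16StrCodec and the new facet-id-normalized-string-strings database is simply a big-endian u16 field id followed by the raw UTF-8 bytes of the normalized facet value. A standalone sketch of that layout (plain helper functions for illustration, not the heed trait impls added above):

use std::convert::TryInto;

// Encode (field_id, value) the same way as the codec: 2-byte big-endian prefix + UTF-8 bytes.
fn encode(field_id: u16, value: &str) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(value.len() + 2);
    bytes.extend_from_slice(&field_id.to_be_bytes());
    bytes.extend_from_slice(value.as_bytes());
    bytes
}

// Decode it back, returning None on malformed input.
fn decode(bytes: &[u8]) -> Option<(u16, &str)> {
    if bytes.len() < 2 {
        return None;
    }
    let (n_bytes, str_bytes) = bytes.split_at(2);
    let field_id = u16::from_be_bytes(n_bytes.try_into().ok()?);
    let value = std::str::from_utf8(str_bytes).ok()?;
    Some((field_id, value))
}

fn main() {
    // The big-endian prefix keeps LMDB keys grouped and sorted by field id.
    let key = encode(3, "jean-paul sartre");
    assert_eq!(decode(&key), Some((3, "jean-paul sartre")));
}
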
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 55b283931..3e5f63fd5 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -51,9 +51,10 @@ pub use self::error::{
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
-    BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
-    CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
-    RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec,
+    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
+    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
+    UncheckedU8StrStrCodec,
 };
 pub use self::index::Index;
 pub use self::search::{
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 65e78caa9..fe34b38e1 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -339,11 +339,12 @@ impl<'a> SearchForFacetValues<'a> {
                 let mut stream = fst.search(automaton).into_stream();
                 let mut length = 0;
-                while let Some(facet_value) = stream.next() {
+                'outer: while let Some(facet_value) = stream.next() {
                     let value = std::str::from_utf8(facet_value)?;
-                    let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
-                    let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
-                        Some(FacetGroupValue { bitmap, .. }) => bitmap,
+                    let database = index.facet_id_normalized_string_strings;
+                    let key = (fid, value);
+                    let original_strings = match database.get(rtxn, &key)? {
+                        Some(original_strings) => original_strings,
                         None => {
                             error!(
                                 "the facet value is missing from the facet database: {key:?}"
@@ -351,16 +352,36 @@ impl<'a> SearchForFacetValues<'a> {
                             continue;
                         }
                     };
-                    let count = search_candidates.intersection_len(&docids);
-                    if count != 0 {
-                        let value = self
-                            .one_original_value_of(fid, value, docids.min().unwrap())?
-                            .unwrap_or_else(|| query.to_string());
-                        results.push(FacetValueHit { value, count });
-                        length += 1;
-                    }
-                    if length >= MAX_NUMBER_OF_FACETS {
-                        break;
+                    for original_string in original_strings {
+                        let key = FacetGroupKey {
+                            field_id: fid,
+                            level: 0,
+                            left_bound: original_string.as_str(),
+                        };
+                        let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
+                            Some(FacetGroupValue { bitmap, .. }) => bitmap,
+                            None => {
+                                error!(
+                                    "the facet value is missing from the facet database: {key:?}"
+                                );
+                                continue;
+                            }
+                        };
+                        let count = search_candidates.intersection_len(&docids);
+                        if count != 0 {
+                            let value = self
+                                .one_original_value_of(
+                                    fid,
+                                    &original_string,
+                                    docids.min().unwrap(),
+                                )?
+                                .unwrap_or_else(|| query.to_string());
+                            results.push(FacetValueHit { value, count });
+                            length += 1;
+                        }
+                        if length >= MAX_NUMBER_OF_FACETS {
+                            break 'outer;
+                        }
                     }
                 }
             }
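Note: the search path above now resolves a match in two steps: the FST automaton yields a normalized value, the new database maps it to every original spelling, and the docids are still fetched per original string. A simplified, self-contained model of that flow, with plain collections and made-up data standing in for the LMDB databases and roaring bitmaps:

use std::collections::{BTreeSet, HashMap};

fn main() {
    // (field_id, normalized value) -> original facet strings
    // (stands in for facet_id_normalized_string_strings)
    let mut normalized_to_originals: HashMap<(u16, &str), BTreeSet<&str>> = HashMap::new();
    normalized_to_originals.insert((0, "cafe"), BTreeSet::from(["CAFE", "Café"]));

    // (field_id, original value) -> document ids
    // (stands in for facet_id_string_docids, which stores roaring bitmaps)
    let mut original_to_docids: HashMap<(u16, &str), BTreeSet<u32>> = HashMap::new();
    original_to_docids.insert((0, "Café"), BTreeSet::from([1, 2]));
    original_to_docids.insert((0, "CAFE"), BTreeSet::from([3]));

    // For a normalized value matched by the FST automaton, walk every original
    // spelling and count its documents, like the new inner `for` loop does.
    let matched = (0u16, "cafe");
    for original in &normalized_to_originals[&matched] {
        let docids = &original_to_docids[&(matched.0, *original)];
        println!("{original}: {} document(s)", docids.len());
    }
}
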
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 5fdf8ef49..2d2f78d5e 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
+            facet_id_normalized_string_strings,
             facet_id_string_fst,
             facet_id_exists_docids,
             facet_id_is_null_docids,
@@ -92,6 +93,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_fid_docids.clear(self.wtxn)?;
         script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
+        facet_id_normalized_string_strings.clear(self.wtxn)?;
         facet_id_string_fst.clear(self.wtxn)?;
         facet_id_exists_docids.clear(self.wtxn)?;
         facet_id_is_null_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index c9124e591..a412291ab 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -237,6 +237,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             word_prefix_fid_docids,
             facet_id_f64_docids: _,
             facet_id_string_docids: _,
+            facet_id_normalized_string_strings: _,
             facet_id_string_fst: _,
             field_id_docid_facet_f64s: _,
             field_id_docid_facet_strings: _,
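Note: the next file rebuilds the normalized-strings database and the per-field FSTs from scratch on every facet update. A simplified model of the grouping it performs, where a BTreeMap stands in for the grenad sorter plus merge_btreeset_string, and to_lowercase stands in for charabia's lossy normalizer:

use std::collections::{BTreeMap, BTreeSet};

fn main() {
    // Level-0 facet strings as (field_id, original value), in no particular order.
    let level0_facets = vec![(0u16, "Blue"), (0, "BLUE"), (0, "blue"), (1, "Red")];

    // Group every original spelling under its (field_id, normalized value) key.
    let mut grouped: BTreeMap<(u16, String), BTreeSet<String>> = BTreeMap::new();
    for (field_id, original) in level0_facets {
        let normalized = original.to_lowercase(); // stand-in for left_bound.normalize(&options)
        grouped.entry((field_id, normalized)).or_default().insert(original.to_string());
    }

    // Iterating the map yields keys sorted by (field_id, normalized value),
    // which is the order needed to then build one FST per field.
    for ((field_id, normalized), originals) in &grouped {
        println!("field {field_id}: {normalized} <- {originals:?}");
    }
}
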
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 0e6fd494c..16fc1cd2f 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
 pub const FACET_GROUP_SIZE: u8 = 4;
 pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

+use std::collections::BTreeSet;
 use std::fs::File;
+use std::iter::FromIterator;

-use heed::types::DecodeIgnore;
+use charabia::normalizer::{Normalize, NormalizerOption};
+use grenad::{CompressionType, SortAlgorithm};
+use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
+use heed::BytesEncode;
 use log::debug;
 use time::OffsetDateTime;

@@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::{Index, Result, BEU16};
+use crate::update::index_documents::create_sorter;
+use crate::update::merge_btreeset_string;
+use crate::{BEU16StrCodec, Index, Result, BEU16};

 pub mod bulk;
 pub mod delete;
@@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
             incremental_update.execute(wtxn)?;
         }

+        // We clear the list of normalized-for-search facets
+        // and the previous FSTs to compute everything from scratch
+        self.index.facet_id_normalized_string_strings.clear(wtxn)?;
+        self.index.facet_id_string_fst.clear(wtxn)?;
+
+        // As we can't use the same write transaction to read and write in two different databases
+        // we must create a temporary sorter that we will write into LMDB afterward.
+        // As multiple unnormalized facet values can become the same normalized facet value
+        // we must merge them together.
+        let mut sorter = create_sorter(
+            SortAlgorithm::Unstable,
+            merge_btreeset_string,
+            CompressionType::None,
+            None,
+            None,
+            None,
+        );
+
+        // We iterate on the list of original, semi-normalized, facet values
+        // and normalize them for search, inserting them in LMDB in any given order.
+        let options = NormalizerOption { lossy: true, ..Default::default() };
+        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        for result in database.iter(wtxn)? {
+            let (facet_group_key, ()) = result?;
+            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+                let normalized_facet = left_bound.normalize(&options);
+                let set = BTreeSet::from_iter(std::iter::once(left_bound));
+                let key = (field_id, normalized_facet.as_ref());
+                let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
+                let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
+                sorter.insert(key, val)?;
+            }
+        }
+
+        // In this loop we don't need to take care of merging bitmaps
+        // as the grenad sorter already merged them for us.
+        let mut merger_iter = sorter.into_stream_merger_iter()?;
+        while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
+            self.index
+                .facet_id_normalized_string_strings
+                .remap_types::<ByteSlice, ByteSlice>()
+                .put(wtxn, key_bytes, btreeset_bytes)?;
+        }
+
         // We compute one FST by string facet
         let mut text_fsts = vec![];
         let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
-        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        let database =
+            self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
         for result in database.iter(wtxn)? {
-            let (facet_group_key, _) = result?;
-            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-                current_fst = match current_fst.take() {
-                    Some((fid, fst_builder)) if fid != field_id => {
-                        let fst = fst_builder.into_set();
-                        text_fsts.push((fid, fst));
-                        Some((field_id, fst::SetBuilder::memory()))
-                    }
-                    Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
-                    None => Some((field_id, fst::SetBuilder::memory())),
-                };
-
-                if let Some((_, fst_builder)) = current_fst.as_mut() {
-                    fst_builder.insert(left_bound)?;
+            let ((field_id, normalized_facet), _) = result?;
+            current_fst = match current_fst.take() {
+                Some((fid, fst_builder)) if fid != field_id => {
+                    let fst = fst_builder.into_set();
+                    text_fsts.push((fid, fst));
+                    Some((field_id, fst::SetBuilder::memory()))
                 }
+                Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
+                None => Some((field_id, fst::SetBuilder::memory())),
+            };
+
+            if let Some((_, fst_builder)) = current_fst.as_mut() {
+                fst_builder.insert(normalized_facet)?;
             }
         }

@@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
             text_fsts.push((field_id, fst));
         }

-        // We remove all of the previous FSTs that were in this database
-        self.index.facet_id_string_fst.clear(wtxn)?;
-
         // We write those FSTs in LMDB now
         for (field_id, fst) in text_fsts {
             self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
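Note: the per-field FSTs written above are what the search side walks with a prefix automaton. A minimal sketch of that interaction using the fst crate directly, with made-up values and simplified error handling:

use fst::{Automaton, IntoStreamer, Set, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Keys must be inserted in lexicographic order; the update side gets this for
    // free because LMDB iterates the normalized keys in sorted order.
    let set = Set::from_iter(vec!["cafe", "camera", "car", "truck"])?;

    // `Str::new(query).starts_with()` is the same kind of automaton the search code builds.
    let automaton = fst::automaton::Str::new("ca").starts_with();
    let mut stream = set.search(automaton).into_stream();

    while let Some(normalized_facet) = stream.next() {
        println!("matched: {}", std::str::from_utf8(normalized_facet)?);
    }
    Ok(())
}
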
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 64bee95df..5d111067a 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::BTreeSet;
 use std::io;
 use std::result::Result as StdResult;

@@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
     }
 }

+pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    if values.len() == 1 {
+        Ok(values[0].clone())
+    } else {
+        // TODO improve the perf by using a `#[borrow] Cow<str>`.
+        let strings: BTreeSet<String> = values
+            .iter()
+            .map(AsRef::as_ref)
+            .map(serde_json::from_slice::<BTreeSet<String>>)
+            .map(StdResult::unwrap)
+            .reduce(|mut current, new| {
+                for x in new {
+                    current.insert(x);
+                }
+                current
+            })
+            .unwrap();
+        Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
+    }
+}
+
 pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
     Ok(values[0].clone())
 }
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 95e497af4..d59a3bc08 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -13,9 +13,9 @@ pub use grenad_helpers::{
     GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
-    concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
-    merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
-    MergeFn,
+    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
+    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
+    serialize_roaring_bitmap, MergeFn,
 };

 use crate::MAX_WORD_LENGTH;
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 20a36237e..849e84035 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -26,7 +26,7 @@ pub use self::enrich::{
 };
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
     sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index 32584825b..9982957e5 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
 pub use self::index_documents::{
-    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
-    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
+    merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
+    MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
 pub use self::prefix_word_pairs::{
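Note: merge_btreeset_string above treats every value as a JSON-encoded BTreeSet<String> and merges by set union. A standalone demonstration of that behavior with serde_json (illustrative only, outside the grenad sorter):

use std::collections::BTreeSet;

fn main() -> serde_json::Result<()> {
    // Two values as they would reach the merge function: JSON-encoded sets of
    // original facet strings that normalized to the same key.
    let a = serde_json::to_vec(&BTreeSet::from(["Café".to_string(), "CAFE".to_string()]))?;
    let b = serde_json::to_vec(&BTreeSet::from(["café".to_string()]))?;

    // Mirror the reduce step: deserialize every value and union into one set.
    let merged: BTreeSet<String> = [a, b]
        .iter()
        .map(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap())
        .reduce(|mut current, new| {
            current.extend(new);
            current
        })
        .unwrap();

    assert_eq!(merged.len(), 3);
    println!("{}", serde_json::to_string(&merged)?);
    Ok(())
}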