From 467b49153d7d012380ac045ac4ea7da0fa2ab609 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 6 Dec 2023 15:49:02 +0100 Subject: [PATCH] Implement proximityPrecision setting on milli side --- milli/src/index.rs | 24 +++ milli/src/proximity.rs | 10 + milli/src/search/new/db_cache.rs | 195 +++++++++++++----- .../src/update/index_documents/extract/mod.rs | 24 ++- milli/src/update/index_documents/mod.rs | 2 + milli/src/update/settings.rs | 35 +++- 6 files changed, 224 insertions(+), 66 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 800edcbfc..01a01ac37 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,6 +21,7 @@ use crate::heed_codec::facet::{ use crate::heed_codec::{ BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, }; +use crate::proximity::ProximityPrecision; use crate::readable_slices::ReadableSlices; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -72,6 +73,7 @@ pub mod main_key { pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; + pub const PROXIMITY_PRECISION: &str = "proximity-precision"; } pub mod db_name { @@ -1466,6 +1468,28 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } + pub fn proximity_precision(&self, txn: &RoTxn) -> heed::Result> { + self.main + .remap_types::>() + .get(txn, main_key::PROXIMITY_PRECISION) + } + + pub(crate) fn put_proximity_precision( + &self, + txn: &mut RwTxn, + val: ProximityPrecision, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + txn, + main_key::PROXIMITY_PRECISION, + &val, + ) + } + + pub(crate) fn delete_proximity_precision(&self, txn: &mut RwTxn) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::PROXIMITY_PRECISION) + } + /* script language docids */ /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. pub fn script_language_documents_ids( diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index 8261015a3..2745527c1 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -1,5 +1,7 @@ use std::cmp; +use serde::{Deserialize, Serialize}; + use crate::{relative_from_absolute_position, Position}; pub const MAX_DISTANCE: u32 = 4; @@ -25,3 +27,11 @@ pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { pub fn path_proximity(path: &[Position]) -> u32 { path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::() } + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub enum ProximityPrecision { + #[default] + WordScale, + AttributeScale, +} diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index b7a74fb62..051e366d0 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -10,6 +10,7 @@ use roaring::RoaringBitmap; use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; +use crate::proximity::ProximityPrecision; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, @@ -263,18 +264,67 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { - // TODO: if database is empty, search if the word are in the same attribute instead - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, word1, word2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(word2).as_str(), - ), - &mut self.db_cache.word_pair_proximity_docids, - self.index.word_pair_proximity_docids.remap_data_type::(), - ) + match self.index.proximity_precision(self.txn)?.unwrap_or_default() { + ProximityPrecision::AttributeScale => { + // Force proximity to 0 because: + // in AttributeScale, there are only 2 possible distances: + // 1. words in same attribute: in that the DB contains (0, word1, word2) + // 2. words in different attributes: no DB entry for these two words. + let proximity = 0; + let docids = if let Some(docids) = + self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2)) + { + docids + .as_ref() + .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d)) + .transpose() + .map_err(heed::Error::Decoding)? + } else { + // Compute the distance at the attribute level and store it in the cache. + let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { + fids + } else { + self.index.fields_ids_map(self.txn)?.ids().collect() + }; + let mut docids = RoaringBitmap::new(); + for fid in fids { + // for each field, intersect left word bitmap and right word bitmap, + // then merge the result in a global bitmap before storing it in the cache. + let word1_docids = self.get_db_word_fid_docids(word1, fid)?; + let word2_docids = self.get_db_word_fid_docids(word2, fid)?; + if let (Some(word1_docids), Some(word2_docids)) = + (word1_docids, word2_docids) + { + docids |= word1_docids & word2_docids; + } + } + let encoded = CboRoaringBitmapCodec::bytes_encode(&docids) + .map(Cow::into_owned) + .map(Cow::Owned) + .map(Some) + .map_err(heed::Error::Decoding)?; + self.db_cache + .word_pair_proximity_docids + .insert((proximity, word1, word2), encoded); + Some(docids) + }; + + Ok(docids) + } + ProximityPrecision::WordScale => { + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), + ) + } + } } pub fn get_db_word_pair_proximity_docids_len( @@ -283,56 +333,95 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { - // TODO: if database is empty, search if the word are in the same attribute instead - DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( - self.txn, - (proximity, word1, word2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(word2).as_str(), - ), - &mut self.db_cache.word_pair_proximity_docids, - self.index.word_pair_proximity_docids.remap_data_type::(), - ) + match self.index.proximity_precision(self.txn)?.unwrap_or_default() { + ProximityPrecision::AttributeScale => Ok(self + .get_db_word_pair_proximity_docids(word1, word2, proximity)? + .map(|d| d.len())), + ProximityPrecision::WordScale => { + DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), + ) + } + } } pub fn get_db_word_prefix_pair_proximity_docids( &mut self, word1: Interned, prefix2: Interned, - proximity: u8, + mut proximity: u8, ) -> Result> { - // TODO: if database is empty, search if the word are in the same attribute instead - let docids = match self - .db_cache - .word_prefix_pair_proximity_docids - .entry((proximity, word1, prefix2)) - { - Entry::Occupied(docids) => docids.get().clone(), - Entry::Vacant(entry) => { - // compute docids using prefix iter and store the result in the cache. - let key = U8StrStrCodec::bytes_encode(&( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(prefix2).as_str(), - )) - .unwrap() - .into_owned(); - let mut prefix_docids = RoaringBitmap::new(); - let remap_key_type = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .prefix_iter(self.txn, &key)?; - for result in remap_key_type { - let (_, docids) = result?; + let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::AttributeScale { + // Force proximity to 0 because: + // in AttributeScale, there are only 2 possible distances: + // 1. words in same attribute: in that the DB contains (0, word1, word2) + // 2. words in different attributes: no DB entry for these two words. + proximity = 0; + } - prefix_docids |= docids; + let docids = if let Some(docids) = + self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2)) + { + docids.clone() + } else { + let prefix_docids = match proximity_precision { + ProximityPrecision::AttributeScale => { + // Compute the distance at the attribute level and store it in the cache. + let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { + fids + } else { + self.index.fields_ids_map(self.txn)?.ids().collect() + }; + let mut prefix_docids = RoaringBitmap::new(); + // for each field, intersect left word bitmap and right word bitmap, + // then merge the result in a global bitmap before storing it in the cache. + for fid in fids { + let word1_docids = self.get_db_word_fid_docids(word1, fid)?; + let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?; + if let (Some(word1_docids), Some(prefix2_docids)) = + (word1_docids, prefix2_docids) + { + prefix_docids |= word1_docids & prefix2_docids; + } + } + prefix_docids } - entry.insert(Some(prefix_docids.clone())); - Some(prefix_docids) - } + ProximityPrecision::WordScale => { + // compute docids using prefix iter and store the result in the cache. + let key = U8StrStrCodec::bytes_encode(&( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(prefix2).as_str(), + )) + .unwrap() + .into_owned(); + let mut prefix_docids = RoaringBitmap::new(); + let remap_key_type = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .prefix_iter(self.txn, &key)?; + for result in remap_key_type { + let (_, docids) = result?; + + prefix_docids |= docids; + } + prefix_docids + } + }; + self.db_cache + .word_prefix_pair_proximity_docids + .insert((proximity, word1, prefix2), Some(prefix_docids.clone())); + Some(prefix_docids) }; Ok(docids) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 366e61c04..57f349894 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -32,6 +32,7 @@ use super::helpers::{ MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; +use crate::proximity::ProximityPrecision; use crate::{FieldId, Result}; /// Extract data for each databases from obkv documents in parallel. @@ -52,7 +53,7 @@ pub(crate) fn data_from_obkv_documents( dictionary: Option<&[&str]>, max_positions_per_attributes: Option, exact_attributes: HashSet, - // TODO: add a proximity database deactivation parameter. + proximity_precision: ProximityPrecision, ) -> Result<()> { puffin::profile_function!(); @@ -151,16 +152,17 @@ pub(crate) fn data_from_obkv_documents( }); } - // TODO: Skip this part if deactivated - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_pair_proximity_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", - ); + if proximity_precision == ProximityPrecision::WordScale { + spawn_extraction_task::<_, _, Vec>>>( + docid_word_positions_chunks.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + merge_deladd_cbo_roaring_bitmaps, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + } spawn_extraction_task::<_, _, Vec>>>( docid_word_positions_chunks.clone(), diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index be2fbb25e..f825cad1c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -352,6 +352,7 @@ where let dictionary: Option> = dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; + let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); let pool_params = GrenadParameters { chunk_compression_type: self.indexer_config.chunk_compression_type, @@ -392,6 +393,7 @@ where dictionary.as_deref(), max_positions_per_attributes, exact_attributes, + proximity_precision, ) }); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 0a069c6df..712e595e9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,6 +12,7 @@ use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, OrderBy, Result}; @@ -127,7 +128,7 @@ pub struct Settings<'a, 't, 'i> { max_values_per_facet: Setting, sort_facet_values_by: Setting>, pagination_max_total_hits: Setting, - // TODO: add a proximity database deactivation attribute. + proximity_precision: Setting, } impl<'a, 't, 'i> Settings<'a, 't, 'i> { @@ -159,6 +160,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { max_values_per_facet: Setting::NotSet, sort_facet_values_by: Setting::NotSet, pagination_max_total_hits: Setting::NotSet, + proximity_precision: Setting::NotSet, indexer_config, } } @@ -333,6 +335,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.pagination_max_total_hits = Setting::Reset; } + pub fn set_proximity_precision(&mut self, value: ProximityPrecision) { + self.proximity_precision = Setting::Set(value); + } + + pub fn reset_proximity_precision(&mut self) { + self.proximity_precision = Setting::Reset; + } + fn reindex( &mut self, progress_callback: &FP, @@ -862,6 +872,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(()) } + fn update_proximity_precision(&mut self) -> Result { + let changed = match self.proximity_precision { + Setting::Set(new) => { + let old = self.index.proximity_precision(self.wtxn)?; + if old == Some(new) { + false + } else { + self.index.put_proximity_precision(self.wtxn, new)?; + true + } + } + Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?, + Setting::NotSet => false, + }; + + Ok(changed) + } + pub fn execute(mut self, progress_callback: FP, should_abort: FA) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -898,6 +926,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; let exact_attributes_updated = self.update_exact_attributes()?; + let proximity_precision = self.update_proximity_precision()?; if stop_words_updated || non_separator_tokens_updated @@ -907,7 +936,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { || synonyms_updated || searchable_updated || exact_attributes_updated - // TODO: reindex if proximity database is activated + || proximity_precision { self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; } @@ -1733,6 +1762,7 @@ mod tests { max_values_per_facet, sort_facet_values_by, pagination_max_total_hits, + proximity_precision, } = settings; assert!(matches!(searchable_fields, Setting::NotSet)); assert!(matches!(displayed_fields, Setting::NotSet)); @@ -1754,6 +1784,7 @@ mod tests { assert!(matches!(max_values_per_facet, Setting::NotSet)); assert!(matches!(sort_facet_values_by, Setting::NotSet)); assert!(matches!(pagination_max_total_hits, Setting::NotSet)); + assert!(matches!(proximity_precision, Setting::NotSet)); }) .unwrap(); }