Implement proximityPrecision setting on milli side

This commit is contained in:
ManyTheFish 2023-12-06 15:49:02 +01:00
parent 0c3fa8cbc4
commit 467b49153d
6 changed files with 224 additions and 66 deletions

View File

@ -21,6 +21,7 @@ use crate::heed_codec::facet::{
use crate::heed_codec::{ use crate::heed_codec::{
BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
}; };
use crate::proximity::ProximityPrecision;
use crate::readable_slices::ReadableSlices; use crate::readable_slices::ReadableSlices;
use crate::{ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -72,6 +73,7 @@ pub mod main_key {
pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet";
pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
pub const PROXIMITY_PRECISION: &str = "proximity-precision";
} }
pub mod db_name { pub mod db_name {
@ -1466,6 +1468,28 @@ impl Index {
self.main.remap_key_type::<Str>().delete(txn, main_key::PAGINATION_MAX_TOTAL_HITS) self.main.remap_key_type::<Str>().delete(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
} }
/// Reads the proximity precision stored in the main database.
///
/// The value is looked up under `main_key::PROXIMITY_PRECISION` and decoded with
/// `SerdeBincode<ProximityPrecision>`. `None` is returned when no value is stored;
/// callers in this crate fall back to `ProximityPrecision::default()` in that case.
pub fn proximity_precision(&self, txn: &RoTxn) -> heed::Result<Option<ProximityPrecision>> {
    let typed_main = self.main.remap_types::<Str, SerdeBincode<ProximityPrecision>>();
    typed_main.get(txn, main_key::PROXIMITY_PRECISION)
}
/// Stores `val` as the index proximity precision.
///
/// The value is written under `main_key::PROXIMITY_PRECISION` in the main database,
/// encoded with `SerdeBincode<ProximityPrecision>`, replacing any previous value.
pub(crate) fn put_proximity_precision(
    &self,
    txn: &mut RwTxn,
    val: ProximityPrecision,
) -> heed::Result<()> {
    let typed_main = self.main.remap_types::<Str, SerdeBincode<ProximityPrecision>>();
    typed_main.put(txn, main_key::PROXIMITY_PRECISION, &val)
}
/// Removes the stored proximity precision from the main database.
///
/// The returned boolean is forwarded unchanged from the database `delete` call
/// (presumably `true` when an entry was actually removed — confirm against heed).
pub(crate) fn delete_proximity_precision(&self, txn: &mut RwTxn) -> heed::Result<bool> {
    let keyed_main = self.main.remap_key_type::<Str>();
    keyed_main.delete(txn, main_key::PROXIMITY_PRECISION)
}
/* script language docids */ /* script language docids */
/// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
pub fn script_language_documents_ids( pub fn script_language_documents_ids(

View File

@ -1,5 +1,7 @@
use std::cmp; use std::cmp;
use serde::{Deserialize, Serialize};
use crate::{relative_from_absolute_position, Position}; use crate::{relative_from_absolute_position, Position};
pub const MAX_DISTANCE: u32 = 4; pub const MAX_DISTANCE: u32 = 4;
@ -25,3 +27,11 @@ pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
pub fn path_proximity(path: &[Position]) -> u32 { pub fn path_proximity(path: &[Position]) -> u32 {
path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>() path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
} }
/// Granularity at which the proximity between two query words is evaluated.
///
/// The setting is persisted in the index main database and read both at indexing
/// time and at search time. Variant names are exposed to serde in camelCase.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub enum ProximityPrecision {
/// Default precision: proximities are looked up in the word pair proximity
/// database, which is only extracted at indexing time for this variant.
#[default]
WordScale,
/// Coarser precision: two words are either found in the same attribute or they
/// are not, so the search computes the pair docids from per-field word docids.
AttributeScale,
}

View File

@ -10,6 +10,7 @@ use roaring::RoaringBitmap;
use super::interner::Interned; use super::interner::Interned;
use super::Word; use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::proximity::ProximityPrecision;
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
use crate::{ use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
@ -263,18 +264,67 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
// TODO: if database is empty, search if the word are in the same attribute instead match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( ProximityPrecision::AttributeScale => {
self.txn, // Force proximity to 0 because:
(proximity, word1, word2), // in AttributeScale, there are only 2 possible distances:
&( // 1. words in same attribute: in that the DB contains (0, word1, word2)
proximity, // 2. words in different attributes: no DB entry for these two words.
self.word_interner.get(word1).as_str(), let proximity = 0;
self.word_interner.get(word2).as_str(), let docids = if let Some(docids) =
), self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2))
&mut self.db_cache.word_pair_proximity_docids, {
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), docids
) .as_ref()
.map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
.transpose()
.map_err(heed::Error::Decoding)?
} else {
// Compute the distance at the attribute level and store it in the cache.
let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
fids
} else {
self.index.fields_ids_map(self.txn)?.ids().collect()
};
let mut docids = RoaringBitmap::new();
for fid in fids {
// for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache.
let word1_docids = self.get_db_word_fid_docids(word1, fid)?;
let word2_docids = self.get_db_word_fid_docids(word2, fid)?;
if let (Some(word1_docids), Some(word2_docids)) =
(word1_docids, word2_docids)
{
docids |= word1_docids & word2_docids;
}
}
let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
.map(Cow::into_owned)
.map(Cow::Owned)
.map(Some)
.map_err(heed::Error::Decoding)?;
self.db_cache
.word_pair_proximity_docids
.insert((proximity, word1, word2), encoded);
Some(docids)
};
Ok(docids)
}
ProximityPrecision::WordScale => {
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
)
}
}
} }
pub fn get_db_word_pair_proximity_docids_len( pub fn get_db_word_pair_proximity_docids_len(
@ -283,56 +333,95 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<u64>> { ) -> Result<Option<u64>> {
// TODO: if database is empty, search if the word are in the same attribute instead match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( ProximityPrecision::AttributeScale => Ok(self
self.txn, .get_db_word_pair_proximity_docids(word1, word2, proximity)?
(proximity, word1, word2), .map(|d| d.len())),
&( ProximityPrecision::WordScale => {
proximity, DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
self.word_interner.get(word1).as_str(), self.txn,
self.word_interner.get(word2).as_str(), (proximity, word1, word2),
), &(
&mut self.db_cache.word_pair_proximity_docids, proximity,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), self.word_interner.get(word1).as_str(),
) self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
)
}
}
} }
pub fn get_db_word_prefix_pair_proximity_docids( pub fn get_db_word_prefix_pair_proximity_docids(
&mut self, &mut self,
word1: Interned<String>, word1: Interned<String>,
prefix2: Interned<String>, prefix2: Interned<String>,
proximity: u8, mut proximity: u8,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
// TODO: if database is empty, search if the word are in the same attribute instead let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default();
let docids = match self if proximity_precision == ProximityPrecision::AttributeScale {
.db_cache // Force proximity to 0 because:
.word_prefix_pair_proximity_docids // in AttributeScale, there are only 2 possible distances:
.entry((proximity, word1, prefix2)) // 1. words in same attribute: in that the DB contains (0, word1, word2)
{ // 2. words in different attributes: no DB entry for these two words.
Entry::Occupied(docids) => docids.get().clone(), proximity = 0;
Entry::Vacant(entry) => { }
// compute docids using prefix iter and store the result in the cache.
let key = U8StrStrCodec::bytes_encode(&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
))
.unwrap()
.into_owned();
let mut prefix_docids = RoaringBitmap::new();
let remap_key_type = self
.index
.word_pair_proximity_docids
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?;
for result in remap_key_type {
let (_, docids) = result?;
prefix_docids |= docids; let docids = if let Some(docids) =
self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2))
{
docids.clone()
} else {
let prefix_docids = match proximity_precision {
ProximityPrecision::AttributeScale => {
// Compute the distance at the attribute level and store it in the cache.
let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
fids
} else {
self.index.fields_ids_map(self.txn)?.ids().collect()
};
let mut prefix_docids = RoaringBitmap::new();
// for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache.
for fid in fids {
let word1_docids = self.get_db_word_fid_docids(word1, fid)?;
let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?;
if let (Some(word1_docids), Some(prefix2_docids)) =
(word1_docids, prefix2_docids)
{
prefix_docids |= word1_docids & prefix2_docids;
}
}
prefix_docids
} }
entry.insert(Some(prefix_docids.clone())); ProximityPrecision::WordScale => {
Some(prefix_docids) // compute docids using prefix iter and store the result in the cache.
} let key = U8StrStrCodec::bytes_encode(&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
))
.unwrap()
.into_owned();
let mut prefix_docids = RoaringBitmap::new();
let remap_key_type = self
.index
.word_pair_proximity_docids
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?;
for result in remap_key_type {
let (_, docids) = result?;
prefix_docids |= docids;
}
prefix_docids
}
};
self.db_cache
.word_prefix_pair_proximity_docids
.insert((proximity, word1, prefix2), Some(prefix_docids.clone()));
Some(prefix_docids)
}; };
Ok(docids) Ok(docids)
} }

View File

@ -32,6 +32,7 @@ use super::helpers::{
MergeFn, MergeableReader, MergeFn, MergeableReader,
}; };
use super::{helpers, TypedChunk}; use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision;
use crate::{FieldId, Result}; use crate::{FieldId, Result};
/// Extract data for each databases from obkv documents in parallel. /// Extract data for each databases from obkv documents in parallel.
@ -52,7 +53,7 @@ pub(crate) fn data_from_obkv_documents(
dictionary: Option<&[&str]>, dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>, exact_attributes: HashSet<FieldId>,
// TODO: add a proximity database deactivation parameter. proximity_precision: ProximityPrecision,
) -> Result<()> { ) -> Result<()> {
puffin::profile_function!(); puffin::profile_function!();
@ -151,16 +152,17 @@ pub(crate) fn data_from_obkv_documents(
}); });
} }
// TODO: Skip this part if deactivated if proximity_precision == ProximityPrecision::WordScale {
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids, extract_word_pair_proximity_docids,
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPairProximityDocids, TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids", "word-pair-proximity-docids",
); );
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),

View File

@ -352,6 +352,7 @@ where
let dictionary: Option<Vec<_>> = let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
let pool_params = GrenadParameters { let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type, chunk_compression_type: self.indexer_config.chunk_compression_type,
@ -392,6 +393,7 @@ where
dictionary.as_deref(), dictionary.as_deref(),
max_positions_per_attributes, max_positions_per_attributes,
exact_attributes, exact_attributes,
proximity_precision,
) )
}); });

View File

@ -12,6 +12,7 @@ use super::IndexerConfig;
use crate::criterion::Criterion; use crate::criterion::Criterion;
use crate::error::UserError; use crate::error::UserError;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod; use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::{FieldsIdsMap, Index, OrderBy, Result}; use crate::{FieldsIdsMap, Index, OrderBy, Result};
@ -127,7 +128,7 @@ pub struct Settings<'a, 't, 'i> {
max_values_per_facet: Setting<usize>, max_values_per_facet: Setting<usize>,
sort_facet_values_by: Setting<HashMap<String, OrderBy>>, sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
pagination_max_total_hits: Setting<usize>, pagination_max_total_hits: Setting<usize>,
// TODO: add a proximity database deactivation attribute. proximity_precision: Setting<ProximityPrecision>,
} }
impl<'a, 't, 'i> Settings<'a, 't, 'i> { impl<'a, 't, 'i> Settings<'a, 't, 'i> {
@ -159,6 +160,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
max_values_per_facet: Setting::NotSet, max_values_per_facet: Setting::NotSet,
sort_facet_values_by: Setting::NotSet, sort_facet_values_by: Setting::NotSet,
pagination_max_total_hits: Setting::NotSet, pagination_max_total_hits: Setting::NotSet,
proximity_precision: Setting::NotSet,
indexer_config, indexer_config,
} }
} }
@ -333,6 +335,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.pagination_max_total_hits = Setting::Reset; self.pagination_max_total_hits = Setting::Reset;
} }
/// Requests that the index proximity precision be set to `value`.
///
/// This only records the request on the builder; it is applied when the settings
/// update is executed.
pub fn set_proximity_precision(&mut self, value: ProximityPrecision) {
self.proximity_precision = Setting::Set(value);
}
/// Requests that the index proximity precision be reset.
///
/// This only records the request on the builder; when the settings update is
/// executed the stored value is deleted, so readers fall back to the default.
pub fn reset_proximity_precision(&mut self) {
self.proximity_precision = Setting::Reset;
}
fn reindex<FP, FA>( fn reindex<FP, FA>(
&mut self, &mut self,
progress_callback: &FP, progress_callback: &FP,
@ -862,6 +872,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(()) Ok(())
} }
fn update_proximity_precision(&mut self) -> Result<bool> {
let changed = match self.proximity_precision {
Setting::Set(new) => {
let old = self.index.proximity_precision(self.wtxn)?;
if old == Some(new) {
false
} else {
self.index.put_proximity_precision(self.wtxn, new)?;
true
}
}
Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?,
Setting::NotSet => false,
};
Ok(changed)
}
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()> pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
where where
FP: Fn(UpdateIndexingStep) + Sync, FP: Fn(UpdateIndexingStep) + Sync,
@ -898,6 +926,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
let synonyms_updated = self.update_synonyms()?; let synonyms_updated = self.update_synonyms()?;
let searchable_updated = self.update_searchable()?; let searchable_updated = self.update_searchable()?;
let exact_attributes_updated = self.update_exact_attributes()?; let exact_attributes_updated = self.update_exact_attributes()?;
let proximity_precision = self.update_proximity_precision()?;
if stop_words_updated if stop_words_updated
|| non_separator_tokens_updated || non_separator_tokens_updated
@ -907,7 +936,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|| synonyms_updated || synonyms_updated
|| searchable_updated || searchable_updated
|| exact_attributes_updated || exact_attributes_updated
// TODO: reindex if proximity database is activated || proximity_precision
{ {
self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
} }
@ -1733,6 +1762,7 @@ mod tests {
max_values_per_facet, max_values_per_facet,
sort_facet_values_by, sort_facet_values_by,
pagination_max_total_hits, pagination_max_total_hits,
proximity_precision,
} = settings; } = settings;
assert!(matches!(searchable_fields, Setting::NotSet)); assert!(matches!(searchable_fields, Setting::NotSet));
assert!(matches!(displayed_fields, Setting::NotSet)); assert!(matches!(displayed_fields, Setting::NotSet));
@ -1754,6 +1784,7 @@ mod tests {
assert!(matches!(max_values_per_facet, Setting::NotSet)); assert!(matches!(max_values_per_facet, Setting::NotSet));
assert!(matches!(sort_facet_values_by, Setting::NotSet)); assert!(matches!(sort_facet_values_by, Setting::NotSet));
assert!(matches!(pagination_max_total_hits, Setting::NotSet)); assert!(matches!(pagination_max_total_hits, Setting::NotSet));
assert!(matches!(proximity_precision, Setting::NotSet));
}) })
.unwrap(); .unwrap();
} }