Remove Index::faceted_documents_ids

This commit is contained in:
Louis Dureuil 2023-10-23 14:50:11 +02:00
parent 04ec293024
commit 14832cb324
No known key found for this signature in database
9 changed files with 1 additions and 174 deletions

View File

@ -55,7 +55,6 @@ pub mod main_key {
/// e.g. vector-hnsw0x0032. /// e.g. vector-hnsw0x0032.
pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const PRIMARY_KEY_KEY: &str = "primary-key";
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
@ -64,7 +63,6 @@ pub mod main_key {
pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens"; pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens"; pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
pub const DICTIONARY_KEY: &str = "dictionary"; pub const DICTIONARY_KEY: &str = "dictionary";
pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
pub const SYNONYMS_KEY: &str = "synonyms"; pub const SYNONYMS_KEY: &str = "synonyms";
pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms"; pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_FST_KEY: &str = "words-fst";
@ -926,44 +924,6 @@ impl Index {
/* faceted documents ids */ /* faceted documents ids */
/// Stores `docids` in the main database as the set of documents faceted under
/// `field_id` for the given `facet_type`.
///
/// The entry key is the facet-type prefix (string or number) immediately followed
/// by the big-endian bytes of `field_id`.
pub fn put_faceted_documents_ids(
    &self,
    wtxn: &mut RwTxn,
    field_id: FieldId,
    facet_type: FacetType,
    docids: &RoaringBitmap,
) -> heed::Result<()> {
    let prefix = match facet_type {
        FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
        FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
    };

    // Assemble `<prefix><field_id as big-endian bytes>` by appending both parts.
    let mut db_key = Vec::with_capacity(prefix.len() + size_of::<FieldId>());
    db_key.extend_from_slice(prefix.as_bytes());
    db_key.extend_from_slice(&field_id.to_be_bytes());

    self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &db_key, docids)
}
/// Returns the ids of the documents faceted under `field_id` for the given
/// `facet_type`, as stored in the main database.
///
/// The entry key is the facet-type prefix (string or number) immediately followed
/// by the big-endian bytes of `field_id`. When no entry exists for that key, an
/// empty bitmap is returned.
pub fn faceted_documents_ids(
    &self,
    rtxn: &RoTxn,
    field_id: FieldId,
    facet_type: FacetType,
) -> heed::Result<RoaringBitmap> {
    let prefix = match facet_type {
        FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
        FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
    };

    // Assemble `<prefix><field_id as big-endian bytes>` by appending both parts.
    let mut db_key = Vec::with_capacity(prefix.len() + size_of::<FieldId>());
    db_key.extend_from_slice(prefix.as_bytes());
    db_key.extend_from_slice(&field_id.to_be_bytes());

    let stored = self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &db_key)?;
    // A missing entry is treated as "no faceted documents".
    Ok(stored.unwrap_or_else(RoaringBitmap::new))
}
/// Retrieve all the documents which contain this field id set as null /// Retrieve all the documents which contain this field id set as null
pub fn null_faceted_documents_ids( pub fn null_faceted_documents_ids(
&self, &self,

View File

@ -359,31 +359,7 @@ pub fn snap_external_documents_ids(index: &Index) -> String {
snap snap
} }
/// Builds a textual snapshot of the number-faceted documents ids of `index`.
///
/// One line is emitted per field id of the fields ids map: the field id
/// left-aligned on three columns, a space, then the rendered bitmap.
/// Panics if the read transaction, the fields ids map, or any lookup fails.
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
    let txn = index.read_txn().unwrap();
    let ids_map = index.fields_ids_map(&txn).unwrap();

    let mut out = String::new();
    ids_map.ids().for_each(|field_id| {
        let docids = index.faceted_documents_ids(&txn, field_id, FacetType::Number).unwrap();
        let rendered = display_bitmap(&docids);
        writeln!(out, "{field_id:<3} {}", rendered).unwrap();
    });
    out
}
/// Builds a textual snapshot of the string-faceted documents ids of `index`.
///
/// One line is emitted per field id of the fields ids map: the field id
/// left-aligned on three columns, a space, then the rendered bitmap.
/// Panics if the read transaction, the fields ids map, or any lookup fails.
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
    let txn = index.read_txn().unwrap();
    let ids_map = index.fields_ids_map(&txn).unwrap();

    let mut out = String::new();
    ids_map.ids().for_each(|field_id| {
        let docids = index.faceted_documents_ids(&txn, field_id, FacetType::String).unwrap();
        let rendered = display_bitmap(&docids);
        writeln!(out, "{field_id:<3} {}", rendered).unwrap();
    });
    out
}
pub fn snap_words_fst(index: &Index) -> String { pub fn snap_words_fst(index: &Index) -> String {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let words_fst = index.words_fst(&rtxn).unwrap(); let words_fst = index.words_fst(&rtxn).unwrap();
@ -531,12 +507,6 @@ macro_rules! full_snap_of_db {
($index:ident, external_documents_ids) => {{ ($index:ident, external_documents_ids) => {{
$crate::snapshot_tests::snap_external_documents_ids(&$index) $crate::snapshot_tests::snap_external_documents_ids(&$index)
}}; }};
($index:ident, number_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
}};
($index:ident, string_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
}};
($index:ident, words_fst) => {{ ($index:ident, words_fst) => {{
$crate::snapshot_tests::snap_words_fst(&$index) $crate::snapshot_tests::snap_words_fst(&$index)
}}; }};

View File

@ -64,22 +64,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?; self.index.delete_vector_hnsw(self.wtxn)?;
// We clean all the faceted documents ids.
for field_id in faceted_fields {
self.index.put_faceted_documents_ids(
self.wtxn,
field_id,
FacetType::Number,
&empty_roaring,
)?;
self.index.put_faceted_documents_ids(
self.wtxn,
field_id,
FacetType::String,
&empty_roaring,
)?;
}
// Clear the other databases. // Clear the other databases.
word_docids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?;
exact_word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?;

View File

@ -384,12 +384,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
for facet_type in [FacetType::Number, FacetType::String] { for facet_type in [FacetType::Number, FacetType::String] {
let mut affected_facet_values = HashMap::new(); let mut affected_facet_values = HashMap::new();
for field_id in self.index.faceted_fields_ids(self.wtxn)? { for field_id in self.index.faceted_fields_ids(self.wtxn)? {
// Remove docids from the number faceted documents ids
let mut docids =
self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?;
docids -= &self.to_delete_docids;
self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?;
let facet_values = remove_docids_from_field_id_docid_facet_value( let facet_values = remove_docids_from_field_id_docid_facet_value(
self.index, self.index,
self.wtxn, self.wtxn,

View File

@ -23,9 +23,6 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
/// ///
/// First, the new elements are inserted into the level 0 of the database. Then, the /// First, the new elements are inserted into the level 0 of the database. Then, the
/// higher levels are cleared and recomputed from the content of level 0. /// higher levels are cleared and recomputed from the content of level 0.
///
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
/// is updated to contain the new set of faceted documents.
pub struct FacetsUpdateBulk<'i> { pub struct FacetsUpdateBulk<'i> {
index: &'i Index, index: &'i Index,
group_size: u8, group_size: u8,
@ -86,7 +83,7 @@ impl<'i> FacetsUpdateBulk<'i> {
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; // TODO: remove the lambda altogether
Ok(()) Ok(())
})?; })?;
@ -507,7 +504,6 @@ mod tests {
index.add_documents(documents).unwrap(); index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
} }
#[test] #[test]

View File

@ -160,7 +160,6 @@ mod tests {
index.add_documents(documents).unwrap(); index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
let mut wtxn = index.env.write_txn().unwrap(); let mut wtxn = index.env.write_txn().unwrap();
@ -178,7 +177,6 @@ mod tests {
db_snap!(index, soft_deleted_documents_ids, @"[]"); db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
} }
// Same test as above but working with string values for the facets // Same test as above but working with string values for the facets
@ -219,7 +217,6 @@ mod tests {
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
let mut wtxn = index.env.write_txn().unwrap(); let mut wtxn = index.env.write_txn().unwrap();
@ -237,7 +234,6 @@ mod tests {
db_snap!(index, soft_deleted_documents_ids, @"[]"); db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
} }
#[test] #[test]
@ -274,7 +270,6 @@ mod tests {
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
@ -291,12 +286,6 @@ mod tests {
db_snap!(index, soft_deleted_documents_ids, @"[]"); db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
db_snap!(index, string_faceted_documents_ids, 2, @r###"
0 []
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
2 [292, 324, 358, 381, 493, 839, 852, ]
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
"###);
} }
} }

View File

@ -30,9 +30,6 @@ enum DeletionResult {
/// Algorithm to incrementally insert and delete elements into the /// Algorithm to incrementally insert and delete elements into the
/// `facet_id_(string/f64)_docids` databases. /// `facet_id_(string/f64)_docids` databases.
///
/// The `faceted_documents_ids` value in the main database of `Index`
/// is also updated to contain the new set of faceted documents.
pub struct FacetsUpdateIncremental<'i> { pub struct FacetsUpdateIncremental<'i> {
index: &'i Index, index: &'i Index,
inner: FacetsUpdateIncrementalInner, inner: FacetsUpdateIncrementalInner,
@ -70,29 +67,6 @@ impl<'i> FacetsUpdateIncremental<'i> {
} }
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
/// Pending change to a set of document ids, tracked as two bitmaps:
/// the ids to remove and the ids to insert.
///
/// `add` and `delete` each remove their argument from the opposite bitmap,
/// so the most recent operation on a given id is the one that is kept.
#[derive(Default)]
struct DeltaDocids {
    /// Ids to remove when the delta is applied.
    deleted: RoaringBitmap,
    /// Ids to insert when the delta is applied.
    added: RoaringBitmap,
}

impl DeltaDocids {
    /// Records `incoming` as added and drops those ids from the pending deletions.
    fn add(&mut self, incoming: &RoaringBitmap) {
        self.added |= incoming;
        self.deleted -= incoming;
    }

    /// Records `outgoing` as deleted and drops those ids from the pending additions.
    fn delete(&mut self, outgoing: &RoaringBitmap) {
        self.added -= outgoing;
        self.deleted |= outgoing;
    }

    /// Consumes the delta and returns `base` with the deletions removed first,
    /// then the additions inserted.
    fn applied(self, base: RoaringBitmap) -> RoaringBitmap {
        let Self { deleted, added } = self;
        let mut result = base;
        result -= deleted;
        result |= added;
        result
    }
}
let mut new_faceted_docids = HashMap::<FieldId, DeltaDocids>::default();
let mut cursor = self.delta_data.into_cursor()?; let mut cursor = self.delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) { if !valid_lmdb_key(key) {
@ -102,8 +76,6 @@ impl<'i> FacetsUpdateIncremental<'i> {
.ok_or(heed::Error::Encoding)?; .ok_or(heed::Error::Encoding)?;
let value = KvReader::new(value); let value = KvReader::new(value);
let entry = new_faceted_docids.entry(key.field_id).or_default();
let docids_to_delete = value let docids_to_delete = value
.get(DelAdd::Deletion) .get(DelAdd::Deletion)
.map(CboRoaringBitmapCodec::bytes_decode) .map(CboRoaringBitmapCodec::bytes_decode)
@ -117,31 +89,14 @@ impl<'i> FacetsUpdateIncremental<'i> {
if let Some(docids_to_delete) = docids_to_delete { if let Some(docids_to_delete) = docids_to_delete {
let docids_to_delete = docids_to_delete?; let docids_to_delete = docids_to_delete?;
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
entry.delete(&docids_to_delete);
} }
if let Some(docids_to_add) = docids_to_add { if let Some(docids_to_add) = docids_to_add {
let docids_to_add = docids_to_add?; let docids_to_add = docids_to_add?;
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
entry.add(&docids_to_add);
} }
} }
// FIXME: broken for multi-value facets?
//
// Consider an incremental update: `facet="tags", facet_value="Action", {Del: Some([0, 1]), Add: None }`
// The current code will unconditionally remove docs 0 and 1 from faceted docs for "tags".
// Now for doc 0: `"tags": "Action"`, it's correct behavior
// for doc 1: `"tags": "Action, Adventure"`, it's incorrect behavior
for (field_id, new_docids) in new_faceted_docids {
let old_docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
self.index.put_faceted_documents_ids(
wtxn,
field_id,
self.facet_type,
&new_docids.applied(old_docids),
)?;
}
Ok(()) Ok(())
} }
} }

View File

@ -599,7 +599,6 @@ mod tests {
index.add_documents(documents).unwrap(); index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
let mut documents = vec![]; let mut documents = vec![];
@ -622,7 +621,6 @@ mod tests {
index.add_documents(documents).unwrap(); index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
// Then replace the last document while disabling soft_deletion // Then replace the last document while disabling soft_deletion
@ -647,7 +645,6 @@ mod tests {
index.add_documents(documents).unwrap(); index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
} }
} }

View File

@ -1499,12 +1499,6 @@ mod tests {
3 2 second second 3 2 second second
3 3 third third 3 3 third third
"###); "###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
@ -1528,12 +1522,6 @@ mod tests {
db_snap!(index, facet_id_string_docids, @""); db_snap!(index, facet_id_string_docids, @"");
db_snap!(index, field_id_docid_facet_strings, @""); db_snap!(index, field_id_docid_facet_strings, @"");
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
@ -1560,12 +1548,6 @@ mod tests {
3 2 second second 3 2 second second
3 3 third third 3 3 third third
"###); "###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();