use std::collections::{HashMap, HashSet};

use heed::RwTxn;
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;

use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
use crate::{FieldId, Index, Result};

/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of removed elements and the existing size of the database, we use either
/// a bulk delete method or an incremental delete method.
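///
/// A minimal usage sketch (marked `ignore`, so it is not compiled as a doctest; `index`, `wtxn`,
/// `affected_values`, and `docids_to_delete` are hypothetical variables assumed to already exist):
///
/// ```ignore
/// // Remove `docids_to_delete` from the number facet database entries of the affected fields.
/// let deleter = FacetsDelete::new(&index, FacetType::Number, affected_values, &docids_to_delete);
/// deleter.execute(&mut wtxn)?;
/// ```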
pub struct FacetsDelete<'i, 'b> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
    facet_type: FacetType,
    affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
    docids_to_delete: &'b RoaringBitmap,
    group_size: u8,
    max_group_size: u8,
    min_level_size: u8,
}
impl<'i, 'b> FacetsDelete<'i, 'b> {
    pub fn new(
        index: &'i Index,
        facet_type: FacetType,
        affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
        docids_to_delete: &'b RoaringBitmap,
    ) -> Self {
        let database = match facet_type {
            FacetType::String => index
                .facet_id_string_docids
                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
            FacetType::Number => {
                index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
            }
        };
        Self {
            index,
            database,
            facet_type,
            affected_facet_values,
            docids_to_delete,
            group_size: FACET_GROUP_SIZE,
            max_group_size: FACET_MAX_GROUP_SIZE,
            min_level_size: FACET_MIN_LEVEL_SIZE,
        }
    }

    pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
        self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;

        for (field_id, affected_facet_values) in self.affected_facet_values {
            // This condition is not strictly correct, since it assumes that the length of the
            // database is equal to the number of facet values for the given field_id. As a result,
            // we might sometimes wrongly choose the incremental indexer over the bulk indexer. But
            // the only case where that could really be a performance problem is when we fully delete
            // a large ratio of all facet values for each field id, which should almost never happen.
            // Still, to be overly cautious, a 2x penalty is applied to the incremental indexer:
            // instead of assuming a 70x worst-case performance penalty for the incremental indexer,
            // we assume a 150x worst-case penalty.
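            // For example (hypothetical numbers): with 3_000 entries currently in the database,
            // a field with 20 or more affected facet values (3_000 / 150) takes the bulk path
            // below, while a field with fewer affected values is handled incrementally.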
            if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
                // Bulk delete
                let mut modified = false;

                for facet_value in affected_facet_values {
                    let key =
                        FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
                    let mut old = self.database.get(wtxn, &key)?.unwrap();
                    let previous_len = old.bitmap.len();
                    old.bitmap -= self.docids_to_delete;
                    if old.bitmap.is_empty() {
                        modified = true;
                        self.database.delete(wtxn, &key)?;
                    } else if old.bitmap.len() != previous_len {
                        modified = true;
                        self.database.put(wtxn, &key, &old)?;
                    }
                }
                if modified {
                    let builder = FacetsUpdateBulk::new_not_updating_level_0(
                        self.index,
                        vec![field_id],
                        self.facet_type,
                    );
                    builder.execute(wtxn)?;
                }
            } else {
                // Incremental
                let inc = FacetsUpdateIncrementalInner {
                    db: self.database,
                    group_size: self.group_size,
                    min_level_size: self.min_level_size,
                    max_group_size: self.max_group_size,
                };
                for facet_value in affected_facet_values {
                    inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
                }
            }
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use big_s::S;
    use maplit::hashset;
    use rand::seq::SliceRandom;
    use rand::SeedableRng;
    use roaring::RoaringBitmap;

    use crate::db_snap;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::update::facet::test_helpers::ordered_string;
    use crate::update::{DeleteDocuments, DeletionStrategy};

    #[test]
    fn delete_mixed_incremental_and_bulk() {
        // The point of this test is to create an index populated with documents
        // containing different filterable attributes. Then, we delete a bunch of documents
        // such that a mix of the incremental and bulk indexers is used (depending on the field id).
        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index
            .update_settings(|settings| {
                settings.set_filterable_fields(
                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
                );
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "label": i / 10,
                        "colour": i / 100,
                        "timestamp": i / 2,
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
        db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");

        let mut wtxn = index.env.write_txn().unwrap();

        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.strategy(DeletionStrategy::AlwaysHard);
        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
        // By deleting the first 100 documents, we expect that:
        // - the "id" part of the DB will be updated in bulk, since #affected_facet_values = 100, which is > database_len / 150 (= 13)
        // - the "label" part will be updated incrementally, since #affected_facet_values = 10, which is < 13
        // - the "colour" part will also be updated incrementally, since #affected_facet_values = 1, which is < 13
        // - the "timestamp" part will be updated in bulk, since #affected_facet_values = 50, which is > 13
        // This has to be verified manually by inserting breakpoints or adding print statements to the code when running the test.
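        // (For instance, one hypothetical way to check it: temporarily add an `eprintln!` in
        // `FacetsDelete::execute`, right before the bulk/incremental branch, printing the field id
        // and the chosen path, then run this test with `cargo test -- --nocapture`.)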
        builder.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, soft_deleted_documents_ids, @"[]");
        db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
        db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
    }

    // Same test as above, but working with string values for the facets.
    #[test]
    fn delete_mixed_incremental_and_bulk_string() {
        // The point of this test is to create an index populated with documents
        // containing different filterable attributes. Then, we delete a bunch of documents
        // such that a mix of the incremental and bulk indexers is used (depending on the field id).
        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index
            .update_settings(|settings| {
                settings.set_filterable_fields(
                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
                );
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "label": ordered_string(i / 10),
                        "colour": ordered_string(i / 100),
                        "timestamp": ordered_string(i / 2),
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
        db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
        db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");

        let mut wtxn = index.env.write_txn().unwrap();

        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.strategy(DeletionStrategy::AlwaysHard);
        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
        // By deleting the first 100 documents, we expect that:
        // - the "id" part of the DB will be updated in bulk, since #affected_facet_values = 100, which is > database_len / 150 (= 13)
        // - the "label" part will be updated incrementally, since #affected_facet_values = 10, which is < 13
        // - the "colour" part will also be updated incrementally, since #affected_facet_values = 1, which is < 13
        // - the "timestamp" part will be updated in bulk, since #affected_facet_values = 50, which is > 13
        // This has to be verified manually by inserting breakpoints or adding print statements to the code when running the test.
        builder.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, soft_deleted_documents_ids, @"[]");
        db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
        db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
    }

    #[test]
    fn delete_almost_all_incrementally_string() {
        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index
            .update_settings(|settings| {
                settings.set_filterable_fields(
                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
                );
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "label": ordered_string(i / 10),
                        "colour": ordered_string(i / 100),
                        "timestamp": ordered_string(i / 2),
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
        db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
        db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");

        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);

        let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
        docids_to_delete.shuffle(&mut rng);
        for docid in docids_to_delete.into_iter().take(990) {
            let mut wtxn = index.env.write_txn().unwrap();
            let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
            builder.strategy(DeletionStrategy::AlwaysHard);
            builder.delete_documents(&RoaringBitmap::from_iter([docid]));
            builder.execute().unwrap();
            wtxn.commit().unwrap();
        }

        db_snap!(index, soft_deleted_documents_ids, @"[]");
        db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
        db_snap!(index, string_faceted_documents_ids, 2, @r###"
        0 []
        1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
        2 [292, 324, 358, 381, 493, 839, 852, ]
        3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
        "###);
    }
}

#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
    use std::iter::once;

    use rand::Rng;
    use roaring::RoaringBitmap;

    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::update::facet::test_helpers::FacetIndex;

    // This is a simple test to get an intuition about the relative speed
    // of the incremental vs. bulk indexer.
    //
    // The benchmark shows the worst-case scenario for the incremental indexer, since
    // each facet value contains only one document ID.
    //
    // In that scenario, it appears that the incremental indexer is about 70 times slower than the
    // bulk indexer.
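    //
    // The `#[test]` attribute below is commented out, so the benchmark does not run by default.
    // To try it, temporarily restore the attribute and run, for instance,
    // `cargo test benchmark_facet_indexing_delete -- --nocapture` so the timings are printed
    // (suggested invocation; adjust to your setup).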
    // #[test]
    fn benchmark_facet_indexing_delete() {
        let mut r = rand::thread_rng();

        for i in 1..=20 {
            let size = 50_000 * i;
            let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

            let mut txn = index.env.write_txn().unwrap();
            let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
            for i in 0..size {
                // field id = 0, left_bound = i, docids = [i]
                elements.push(((0, i as f64), once(i).collect()));
            }
            let timer = std::time::Instant::now();
            index.bulk_insert(&mut txn, &[0], elements.iter());
            let time_spent = timer.elapsed().as_millis();
            println!("bulk {size} : {time_spent}ms");

            txn.commit().unwrap();

            for nbr_doc in [1, 100, 1000, 10_000] {
                let mut txn = index.env.write_txn().unwrap();
                let timer = std::time::Instant::now();
                // delete `nbr_doc` documents, one at a time
                for _ in 0..nbr_doc {
                    let deleted_u32 = r.gen::<u32>() % size;
                    let deleted_f64 = deleted_u32 as f64;
                    index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
                }
                let time_spent = timer.elapsed().as_millis();
                println!(" delete {nbr_doc} : {time_spent}ms");
                txn.abort().unwrap();
            }
        }
    }
}