Add facet deletion tests that use both the incremental and bulk methods

+ update deletion snapshots to the new database format
This commit is contained in:
Loïc Lecrenier 2022-10-12 12:32:33 +02:00
parent e3ba1fc883
commit f198b20c42
19 changed files with 302 additions and 146 deletions

View File

@ -54,7 +54,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
}
#[cfg(test)]
fn disable_soft_deletion(&mut self, disable: bool) {
pub fn disable_soft_deletion(&mut self, disable: bool) {
self.disable_soft_deletion = disable;
}

View File

@ -4,9 +4,7 @@ use std::fs::File;
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::{BytesEncode, Error, RoTxn, RwTxn};
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
@ -71,8 +69,6 @@ impl<'i> FacetsUpdateBulk<'i> {
#[logging_timer::time("FacetsUpdateBulk::{}")]
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
let db = match facet_type {
@ -84,8 +80,6 @@ impl<'i> FacetsUpdateBulk<'i> {
}
};
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {

View File

@ -1,15 +1,21 @@
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::{
facet::FacetType,
heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec},
heed_codec::ByteSliceRefCodec,
update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner},
FieldId, Index, Result,
};
use heed::RwTxn;
use roaring::RoaringBitmap;
use std::collections::{HashMap, HashSet};
use heed::RwTxn;
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
use crate::{FieldId, Index, Result};
/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of removed elements and the existing size of the database, we use either
/// a bulk delete method or an incremental delete method.
pub struct FacetsDelete<'i, 'b> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
@ -48,8 +54,18 @@ impl<'i, 'b> FacetsDelete<'i, 'b> {
}
pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
for (field_id, affected_facet_values) in self.affected_facet_values {
if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) {
// This is an incorrect condition, since we assume that the length of the database is equal
// to the number of facet values for the given field_id. It means that in some cases, we might
// wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
// really be a performance problem is when we fully delete a large ratio of all facet values for
// each field id. This would almost never happen. Still, to be overly cautious, I have added a
// 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
// penalty to the incremental indexer, we assume a 150x worst-case performance penalty.
if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
// Bulk delete
let mut modified = false;
@ -91,3 +107,133 @@ impl<'i, 'b> FacetsDelete<'i, 'b> {
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::iter::FromIterator;
use big_s::S;
use maplit::hashset;
use roaring::RoaringBitmap;
use crate::db_snap;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::update::DeleteDocuments;
#[test]
fn delete_mixed_incremental_and_bulk() {
// The point of this test is to create an index populated with documents
// containing different filterable attributes. Then, we delete a bunch of documents
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
// Oversized map so that 1000 documents plus their facet databases fit comfortably.
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
// Make all four fields filterable so that each one gets its own facet database entries.
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
// Build 1000 documents with deliberately different facet-value cardinalities:
// "id" has 1000 distinct values, "timestamp" 500, "label" 100, "colour" 10.
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": i / 10,
"colour": i / 100,
"timestamp": i / 2,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
// Snapshot the facet databases before deletion so the post-deletion state can be compared.
db_snap!(index, facet_id_f64_docids, 1);
db_snap!(index, number_faceted_documents_ids, 1);
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
// Force hard deletion so the facet databases are actually rewritten
// (the `soft_deleted_documents_ids` snapshot below confirms nothing stays soft-deleted).
builder.disable_soft_deletion(true);
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
// by deleting the first 100 documents, we expect that:
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
builder.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_f64_docids, 2);
db_snap!(index, number_faceted_documents_ids, 2);
}
}
#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
    use std::iter::once;

    use rand::Rng;
    use roaring::RoaringBitmap;

    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::update::facet::tests::FacetIndex;

    // A quick experiment to build an intuition for the relative speed of the
    // incremental deletion path versus the bulk indexer.
    //
    // The setup is the worst case for the incremental indexer: every facet
    // value maps to exactly one document id.
    //
    // Under that scenario, the incremental indexer comes out roughly 70 times
    // slower than the bulk indexer.
    // #[test]
    fn benchmark_facet_indexing_delete() {
        let mut rng = rand::thread_rng();

        for step in 1..=20 {
            let size = 50_000 * step;
            let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

            let mut txn = index.env.write_txn().unwrap();
            // field id = 0, left_bound = i, docids = [i]
            let elements: Vec<((u16, f64), RoaringBitmap)> =
                (0..size).map(|i| ((0, i as f64), once(i).collect())).collect();
            let timer = std::time::Instant::now();
            index.bulk_insert(&mut txn, &[0], elements.iter());
            let time_spent = timer.elapsed().as_millis();
            println!("bulk {size} : {time_spent}ms");
            txn.commit().unwrap();

            for nbr_doc in [1, 100, 1000, 10_000] {
                let mut txn = index.env.write_txn().unwrap();
                let timer = std::time::Instant::now();
                //
                // delete one document
                //
                for _ in 0..nbr_doc {
                    let deleted_u32 = rng.gen::<u32>() % size;
                    let deleted_f64 = deleted_u32 as f64;
                    index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
                }
                let time_spent = timer.elapsed().as_millis();
                println!(" delete {nbr_doc} : {time_spent}ms");
                txn.abort().unwrap();
            }
        }
    }
}

View File

@ -78,6 +78,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::fs::File;
use log::debug;
use time::OffsetDateTime;
use self::incremental::FacetsUpdateIncremental;
use super::FacetsUpdateBulk;
use crate::facet::FacetType;
@ -89,6 +92,10 @@ pub mod bulk;
pub mod delete;
pub mod incremental;
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of new elements and the existing size of the database, we use either
/// a bulk update method or an incremental update method.
pub struct FacetsUpdate<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
@ -123,6 +130,10 @@ impl<'i> FacetsUpdate<'i> {
if self.new_data.is_empty() {
return Ok(());
}
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
// See self::comparison_bench::benchmark_facet_indexing
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
@ -204,7 +215,7 @@ pub(crate) mod tests {
let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17
let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 10 * 100);
let options = options.map_size(4096 * 4 * 10 * 1000);
unsafe {
options.flag(heed::flags::Flags::MdbAlwaysFreePages);
}
@ -230,7 +241,7 @@ pub(crate) mod tests {
let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127
let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf
let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 1000);
let options = options.map_size(4096 * 4 * 1000 * 100);
let tempdir = tempfile::TempDir::new().unwrap();
let env = options.open(tempdir.path()).unwrap();
let content = env.create_database(None).unwrap();
@ -440,12 +451,14 @@ mod comparison_bench {
// This is a simple test to get an intuition on the relative speed
// of the incremental vs. bulk indexer.
// It appears that the incremental indexer is about 50 times slower than the
//
// The benchmark shows the worst-case scenario for the incremental indexer, since
// each facet value contains only one document ID.
//
// In that scenario, it appears that the incremental indexer is about 50 times slower than the
// bulk indexer.
// #[test]
fn benchmark_facet_indexing() {
// then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it
let mut facet_value = 0;
let mut r = rand::thread_rng();

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
550cd138d6fe31ccdd42cd5392fbd576

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
9a0ea88e7c9dcf6dc0ef0b601736ffcf

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
d4d5f14e7f1e1f09b86821a0b6defcc6

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
3570e0ac0fdb21be9ebe433f59264b56

View File

@ -1,4 +0,0 @@
---
source: milli/src/update/delete_documents.rs
---

View File

@ -1,4 +0,0 @@
---
source: milli/src/update/delete_documents.rs
---

View File

@ -1,6 +1,6 @@
---
source: milli/src/update/delete_documents.rs
---
2 0 1.2 1.2 [20, 22, ]
2 0 2.2 2.2 [21, ]
2 0 1.2 1 [20, 22, ]
2 0 2.2 1 [21, ]

View File

@ -1,19 +1,19 @@
---
source: milli/src/update/delete_documents.rs
---
1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
1 aquarium aquarium [5, ]
1 art art [4, 5, 8, 9, 10, 12, 17, ]
1 cartoon cartoon [2, 7, 15, 17, ]
1 colorfulness colorfulness [13, ]
1 design design [2, 18, ]
1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ]
1 geometry geometry [19, ]
1 letter letter [1, ]
1 outdoor outdoor [4, ]
1 painting painting [3, ]
1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ]
1 sign sign [0, ]
2 design design [21, ]
2 geometry geometry [20, 22, ]
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
1 0 aquarium 1 [5, ]
1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ]
1 0 cartoon 1 [2, 7, 15, 17, ]
1 0 colorfulness 1 [13, ]
1 0 design 1 [2, 18, ]
1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ]
1 0 geometry 1 [19, ]
1 0 letter 1 [1, ]
1 0 outdoor 1 [4, ]
1 0 painting 1 [3, ]
1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ]
1 0 sign 1 [0, ]
2 0 design 1 [21, ]
2 0 geometry 1 [20, 22, ]

View File

@ -1,4 +1,4 @@
---
source: milli/src/update/delete_documents.rs
---
[0, ]
[0, 20, 22, ]

View File

@ -2,5 +2,5 @@
source: milli/src/update/delete_documents.rs
---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
2 [20, 21, 22, ]
2 [21, ]

View File

@ -1,6 +1,5 @@
---
source: milli/src/update/delete_documents.rs
---
2 0 1.2 1.2 [20, 22, ]
2 0 2.2 2.2 [21, ]
2 0 2.2 1 [21, ]

View File

@ -1,18 +1,17 @@
---
source: milli/src/update/delete_documents.rs
---
1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
1 aquarium aquarium [5, ]
1 art art [4, 5, 8, 9, 10, 12, 17, ]
1 cartoon cartoon [2, 7, 15, 17, ]
1 colorfulness colorfulness [13, ]
1 design design [2, 18, ]
1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ]
1 geometry geometry [19, ]
1 letter letter [1, ]
1 outdoor outdoor [4, ]
1 painting painting [3, ]
1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ]
2 design design [21, ]
2 geometry geometry [20, 22, ]
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
1 0 aquarium 1 [5, ]
1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ]
1 0 cartoon 1 [2, 7, 15, 17, ]
1 0 colorfulness 1 [13, ]
1 0 design 1 [2, 18, ]
1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ]
1 0 geometry 1 [19, ]
1 0 letter 1 [1, ]
1 0 outdoor 1 [4, ]
1 0 painting 1 [3, ]
1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ]
2 0 design 1 [21, ]

View File

@ -1,7 +1,6 @@
---
source: milli/src/update/delete_documents.rs
---
1.2 [20, 22, ]
1_36 [3, ]
1_37 [4, ]
1_38 [5, ]
@ -21,9 +20,7 @@ source: milli/src/update/delete_documents.rs
1_68 [18, ]
1_69 [19, ]
1_7 [2, ]
1_70 [20, ]
1_71 [21, ]
1_72 [22, ]
2.2 [21, ]
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
aquarium [5, ]
@ -32,7 +29,7 @@ cartoon [2, 7, 15, 17, ]
colorfulness [13, ]
design [2, 18, 21, ]
drawing [3, 4, 5, 8, 10, 11, 16, ]
geometry [19, 20, 22, ]
geometry [19, ]
letter [1, ]
outdoor [4, ]
painting [3, ]

View File

@ -1,48 +1,53 @@
---
source: milli/src/update/delete_documents.rs
---
3 0 48.9021 48.9021 [19, ]
3 0 49.4449 49.4449 [18, ]
3 0 49.9314 49.9314 [17, ]
3 0 50.1112 50.1112 [16, ]
3 0 50.1793 50.1793 [15, ]
3 0 50.2844 50.2844 [14, ]
3 0 50.3518 50.3518 [13, ]
3 0 50.4095 50.4095 [11, ]
3 0 50.4502 50.4502 [12, ]
3 0 50.6053 50.6053 [8, ]
3 0 50.6224 50.6224 [3, ]
3 0 50.6299 50.6299 [0, ]
3 0 50.6312 50.6312 [2, ]
3 0 50.6415 50.6415 [1, ]
3 0 50.6552 50.6552 [4, ]
3 0 50.6924 50.6924 [5, ]
3 0 50.7263 50.7263 [6, ]
3 0 50.7453 50.7453 [7, ]
3 0 50.8466 50.8466 [10, ]
3 0 51.0537 51.0537 [9, ]
3 1 48.9021 50.1112 [16, 17, 18, 19, ]
3 1 50.1793 50.4095 [11, 13, 14, 15, ]
3 1 50.4502 50.6299 [0, 3, 8, 12, ]
3 1 50.6312 50.6924 [1, 2, 4, 5, ]
3 1 50.7263 51.0537 [6, 7, 9, 10, ]
4 0 2.271 2.271 [17, ]
4 0 2.3708 2.3708 [19, ]
4 0 2.7637 2.7637 [14, ]
4 0 2.7913 2.7913 [18, ]
4 0 2.8547 2.8547 [16, ]
4 0 3.0569 3.0569 [0, ]
4 0 3.1106 3.1106 [1, 2, ]
4 0 3.1476 3.1476 [3, ]
4 0 3.1541 3.1541 [6, ]
4 0 3.1763 3.1763 [5, ]
4 0 3.1897 3.1897 [4, ]
4 0 3.2189 3.2189 [15, ]
4 0 3.2206 3.2206 [7, ]
4 0 3.3758 3.3758 [8, ]
4 0 3.5326 3.5326 [13, ]
4 0 3.6957 3.6957 [9, ]
4 0 3.9623 3.9623 [12, ]
4 0 4.337 4.337 [10, ]
4 0 4.4347 4.4347 [11, ]
3 0 48.9021 1 [19, ]
3 0 49.4449 1 [18, ]
3 0 49.9314 1 [17, ]
3 0 50.1112 1 [16, ]
3 0 50.1793 1 [15, ]
3 0 50.2844 1 [14, ]
3 0 50.3518 1 [13, ]
3 0 50.4095 1 [11, ]
3 0 50.4502 1 [12, ]
3 0 50.6053 1 [8, ]
3 0 50.6224 1 [3, ]
3 0 50.6299 1 [0, ]
3 0 50.6312 1 [2, ]
3 0 50.6415 1 [1, ]
3 0 50.6552 1 [4, ]
3 0 50.6924 1 [5, ]
3 0 50.7263 1 [6, ]
3 0 50.7453 1 [7, ]
3 0 50.8466 1 [10, ]
3 0 51.0537 1 [9, ]
3 1 48.9021 4 [16, 17, 18, 19, ]
3 1 50.1793 4 [11, 13, 14, 15, ]
3 1 50.4502 4 [0, 3, 8, 12, ]
3 1 50.6312 4 [1, 2, 4, 5, ]
3 1 50.7263 4 [6, 7, 9, 10, ]
4 0 2.271 1 [17, ]
4 0 2.3708 1 [19, ]
4 0 2.7637 1 [14, ]
4 0 2.7913 1 [18, ]
4 0 2.8547 1 [16, ]
4 0 3.0569 1 [0, ]
4 0 3.1106 1 [1, 2, ]
4 0 3.1476 1 [3, ]
4 0 3.1541 1 [6, ]
4 0 3.1763 1 [5, ]
4 0 3.1897 1 [4, ]
4 0 3.2189 1 [15, ]
4 0 3.2206 1 [7, ]
4 0 3.3758 1 [8, ]
4 0 3.5326 1 [13, ]
4 0 3.6957 1 [9, ]
4 0 3.9623 1 [12, ]
4 0 4.337 1 [10, ]
4 0 4.4347 1 [11, ]
4 1 2.271 4 [14, 17, 18, 19, ]
4 1 2.8547 4 [0, 1, 2, 3, 16, ]
4 1 3.1541 4 [4, 5, 6, 15, ]
4 1 3.2206 4 [7, 8, 9, 13, ]
4 1 3.9623 3 [10, 11, 12, ]

View File

@ -1,36 +1,31 @@
---
source: milli/src/update/delete_documents.rs
---
3 0 48.9021 48.9021 [19, ]
3 0 49.9314 49.9314 [17, ]
3 0 50.1793 50.1793 [15, ]
3 0 50.2844 50.2844 [14, ]
3 0 50.3518 50.3518 [13, ]
3 0 50.4502 50.4502 [12, ]
3 0 50.6053 50.6053 [8, ]
3 0 50.6224 50.6224 [3, ]
3 0 50.6299 50.6299 [0, ]
3 0 50.6312 50.6312 [2, ]
3 0 50.6415 50.6415 [1, ]
3 0 50.7453 50.7453 [7, ]
3 0 50.8466 50.8466 [10, ]
3 0 51.0537 51.0537 [9, ]
3 1 48.9021 50.1112 [17, 19, ]
3 1 50.1793 50.4095 [13, 14, 15, ]
3 1 50.4502 50.6299 [0, 3, 8, 12, ]
3 1 50.6312 50.6924 [1, 2, ]
3 1 50.7263 51.0537 [7, 9, 10, ]
4 0 2.271 2.271 [17, ]
4 0 2.3708 2.3708 [19, ]
4 0 2.7637 2.7637 [14, ]
4 0 3.0569 3.0569 [0, ]
4 0 3.1106 3.1106 [1, 2, ]
4 0 3.1476 3.1476 [3, ]
4 0 3.2189 3.2189 [15, ]
4 0 3.2206 3.2206 [7, ]
4 0 3.3758 3.3758 [8, ]
4 0 3.5326 3.5326 [13, ]
4 0 3.6957 3.6957 [9, ]
4 0 3.9623 3.9623 [12, ]
4 0 4.337 4.337 [10, ]
3 0 48.9021 1 [19, ]
3 0 49.9314 1 [17, ]
3 0 50.1793 1 [15, ]
3 0 50.2844 1 [14, ]
3 0 50.3518 1 [13, ]
3 0 50.4502 1 [12, ]
3 0 50.6053 1 [8, ]
3 0 50.6224 1 [3, ]
3 0 50.6299 1 [0, ]
3 0 50.6312 1 [2, ]
3 0 50.6415 1 [1, ]
3 0 50.7453 1 [7, ]
3 0 50.8466 1 [10, ]
3 0 51.0537 1 [9, ]
4 0 2.271 1 [17, ]
4 0 2.3708 1 [19, ]
4 0 2.7637 1 [14, ]
4 0 3.0569 1 [0, ]
4 0 3.1106 1 [1, 2, ]
4 0 3.1476 1 [3, ]
4 0 3.2189 1 [15, ]
4 0 3.2206 1 [7, ]
4 0 3.3758 1 [8, ]
4 0 3.5326 1 [13, ]
4 0 3.6957 1 [9, ]
4 0 3.9623 1 [12, ]
4 0 4.337 1 [10, ]