meilisearch/benchmarks/benches/indexing.rs

1358 lines
48 KiB
Rust
Raw Normal View History

2021-07-07 17:42:14 +08:00
mod datasets_paths;
mod utils;
2021-07-07 17:42:14 +08:00
use std::fs::{create_dir_all, remove_dir_all};
2021-07-07 17:42:14 +08:00
use std::path::Path;
use criterion::{criterion_group, criterion_main, Criterion};
2022-08-11 17:15:46 +08:00
use milli::heed::{EnvOpenOptions, RwTxn};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
2021-07-07 17:42:14 +08:00
use milli::Index;
2022-06-16 16:17:58 +08:00
use rand::seq::SliceRandom;
use rand_chacha::rand_core::SeedableRng;
use roaring::RoaringBitmap;
2021-07-07 17:42:14 +08:00
2022-08-10 18:31:09 +08:00
// Register mimalloc as the global allocator of this benchmark binary.
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
2022-06-16 16:17:58 +08:00
/// Sample size handed to the criterion benchmark groups below via `sample_size`.
const BENCHMARK_ITERATION: usize = 10;
2021-07-07 17:42:14 +08:00
/// Ensures `path` is an existing, empty directory.
///
/// Whatever currently lives at `path` is removed recursively (a missing directory
/// is not an error) and the directory is then created again.
///
/// # Panics
///
/// Panics if the removal fails for any reason other than "not found", or if the
/// directory cannot be created.
fn setup_dir(path: impl AsRef<Path>) {
    let dir = path.as_ref();

    if let Err(err) = remove_dir_all(dir) {
        // A directory that does not exist yet is the expected first-run state.
        if err.kind() != std::io::ErrorKind::NotFound {
            panic!("{}", err);
        }
    }

    create_dir_all(dir).unwrap();
}
fn setup_index() -> Index {
let path = "benches.mmdb";
2023-01-18 01:01:26 +08:00
setup_dir(path);
2021-07-07 17:42:14 +08:00
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
Index::new(options, path).unwrap()
}
2022-06-16 16:17:58 +08:00
/// Applies the primary key and the searchable, filterable and sortable field
/// lists to `index` inside the given write transaction.
///
/// The transaction is not committed here; that is left to the caller.
///
/// # Panics
///
/// Panics if executing the settings update fails.
fn setup_settings<'t>(
    wtxn: &mut RwTxn<'t, '_>,
    index: &'t Index,
    primary_key: &str,
    searchable_fields: &[&str],
    filterable_fields: &[&str],
    sortable_fields: &[&str],
) {
    let indexer_config = IndexerConfig::default();
    let mut settings = Settings::new(wtxn, index, &indexer_config);

    settings.set_primary_key(primary_key.to_owned());
    // Each setter takes an owned collection of `String`s; `collect` picks the
    // concrete collection type from the setter's parameter.
    settings.set_searchable_fields(searchable_fields.iter().map(|&f| f.to_owned()).collect());
    settings.set_filterable_fields(filterable_fields.iter().map(|&f| f.to_owned()).collect());
    settings.set_sortable_fields(sortable_fields.iter().map(|&f| f.to_owned()).collect());

    // No progress reporting, never abort.
    settings.execute(|_| (), || false).unwrap();
}
2023-01-18 01:01:26 +08:00
/// Builds a fresh index through `setup_index` and commits the given settings to it.
///
/// Returns the configured, still empty index.
///
/// # Panics
///
/// Panics if the index cannot be created or the settings transaction fails.
fn setup_index_with_settings(
    primary_key: &str,
    searchable_fields: &[&str],
    filterable_fields: &[&str],
    sortable_fields: &[&str],
) -> milli::Index {
    let index = setup_index();

    let mut txn = index.write_txn().unwrap();
    setup_settings(
        &mut txn,
        &index,
        primary_key,
        searchable_fields,
        filterable_fields,
        sortable_fields,
    );
    txn.commit().unwrap();

    index
}
/// Picks up to `count` document ids from `index` and groups them into bitmaps of
/// at most `batch_size` ids each.
///
/// The selection uses a ChaCha8 generator with a fixed seed (7700).
///
/// # Panics
///
/// Panics if the read transaction or the document id lookup fails, or if
/// `batch_size` is zero (`chunks` rejects a zero chunk size).
fn choose_document_ids_from_index_batched(
    index: &Index,
    count: usize,
    batch_size: usize,
) -> Vec<RoaringBitmap> {
    let rtxn = index.read_txn().unwrap();
    let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7700);

    // Materialise every id so that a random subset can be drawn from a slice.
    let all_ids: Vec<_> = index.documents_ids(&rtxn).unwrap().into_iter().collect();
    let selected: Vec<_> = all_ids.choose_multiple(&mut rng, count).copied().collect();

    // One bitmap per batch of ids to delete.
    selected
        .chunks(batch_size)
        .map(|chunk| chunk.iter().copied().collect::<RoaringBitmap>())
        .collect()
}
2021-07-07 17:42:14 +08:00
fn indexing_songs_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2021-07-07 17:42:14 +08:00
group.bench_function("Indexing songs with default settings", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "album", "artist"];
let filterable_fields =
["released-timestamp", "duration-float", "genre", "country", "artist"];
let sortable_fields = [];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2021-07-07 17:42:14 +08:00
},
2021-08-19 21:02:43 +08:00
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
2021-08-19 21:02:43 +08:00
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2021-07-07 17:42:14 +08:00
wtxn.commit().unwrap();
2021-08-19 21:02:43 +08:00
index.prepare_for_closing().wait();
2021-07-07 17:42:14 +08:00
},
)
});
}
2022-07-04 21:10:12 +08:00
fn reindexing_songs_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Reindexing songs with default settings", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
let searchable_fields = ["title", "album", "artist"];
let filterable_fields =
["released-timestamp", "duration-float", "genre", "country", "artist"];
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-07-04 21:10:12 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2022-06-16 16:17:58 +08:00
fn deleting_songs_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2022-06-16 16:32:58 +08:00
group.bench_function("-songs-delete-facetedString-facetedNumber-searchable-", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "album", "artist"];
let filterable_fields =
["released-timestamp", "duration-float", "genre", "country", "artist"];
let sortable_fields = [];
2022-06-16 16:17:58 +08:00
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
// We index only one half of the dataset in the setup part
// as we don't care about the time it takes.
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
2022-06-16 16:17:58 +08:00
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-06-16 16:17:58 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-06-16 16:17:58 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
let count = 1250;
let batch_size = 250;
let document_ids_to_delete =
choose_document_ids_from_index_batched(&index, count, batch_size);
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
2023-11-06 18:56:46 +08:00
delete_documents_from_ids(index, document_ids_to_delete)
2022-06-16 16:17:58 +08:00
},
)
});
}
fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Indexing songs in three batches with default settings", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
let searchable_fields = ["title", "album", "artist"];
let filterable_fields =
["released-timestamp", "duration-float", "genre", "country", "artist"];
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
// We index only one half of the dataset in the setup part
2022-02-23 00:39:24 +08:00
// as we don't care about the time it takes.
let config = IndexerConfig::default();
2022-06-16 16:17:58 +08:00
let mut wtxn = index.write_txn().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2021-07-07 17:42:14 +08:00
fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2021-07-07 17:42:14 +08:00
group.bench_function("Indexing songs without faceted numbers", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "album", "artist"];
let filterable_fields = ["genre", "country", "artist"];
let sortable_fields = [];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2021-07-07 17:42:14 +08:00
},
2021-08-19 21:02:43 +08:00
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
2021-08-19 21:02:43 +08:00
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2021-07-07 17:42:14 +08:00
wtxn.commit().unwrap();
2021-08-19 21:02:43 +08:00
index.prepare_for_closing().wait();
2021-07-07 17:42:14 +08:00
},
)
});
}
fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2021-07-07 17:42:14 +08:00
group.bench_function("Indexing songs without any facets", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "album", "artist"];
let filterable_fields = [];
let sortable_fields = [];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2021-07-07 17:42:14 +08:00
},
2021-08-19 21:02:43 +08:00
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
2021-08-19 21:02:43 +08:00
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2021-07-07 17:42:14 +08:00
wtxn.commit().unwrap();
2021-08-19 21:02:43 +08:00
index.prepare_for_closing().wait();
2021-07-07 17:42:14 +08:00
},
)
});
}
fn indexing_wiki(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2021-07-07 17:42:14 +08:00
group.bench_function("Indexing wiki", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "body"];
let filterable_fields = [];
let sortable_fields = [];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2021-07-07 17:42:14 +08:00
},
2021-08-19 21:02:43 +08:00
move |index| {
let config = IndexerConfig::default();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
2021-08-19 21:02:43 +08:00
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2021-07-07 17:42:14 +08:00
wtxn.commit().unwrap();
2021-08-19 21:02:43 +08:00
index.prepare_for_closing().wait();
2021-07-07 17:42:14 +08:00
},
)
});
}
2022-07-04 21:10:12 +08:00
fn reindexing_wiki(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Reindexing wiki", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
let searchable_fields = ["title", "body"];
let filterable_fields = [];
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-07-04 21:10:12 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
let config = IndexerConfig::default();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2022-06-16 16:17:58 +08:00
fn deleting_wiki_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2022-06-16 16:32:58 +08:00
group.bench_function("-wiki-delete-searchable-", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "body"];
let filterable_fields = [];
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
2022-06-16 16:17:58 +08:00
// We index only one half of the dataset in the setup part
// as we don't care about the time it takes.
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
2022-06-16 16:17:58 +08:00
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-06-16 16:17:58 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-06-16 16:17:58 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
let count = 1250;
let batch_size = 250;
let document_ids_to_delete =
choose_document_ids_from_index_batched(&index, count, batch_size);
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
2023-11-06 18:56:46 +08:00
delete_documents_from_ids(index, document_ids_to_delete)
2022-06-16 16:17:58 +08:00
},
)
});
}
fn indexing_wiki_in_three_batches(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Indexing wiki in three batches", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
let searchable_fields = ["title", "body"];
let filterable_fields = [];
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
let mut wtxn = index.write_txn().unwrap();
// We index only one half of the dataset in the setup part
2022-02-23 00:39:24 +08:00
// as we don't care about the time it takes.
let config = IndexerConfig::default();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents =
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents =
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents =
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2021-07-07 17:42:14 +08:00
fn indexing_movies_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2021-07-07 17:42:14 +08:00
group.bench_function("Indexing movies with default settings", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "overview"];
let filterable_fields = ["release_date", "genres"];
2022-06-16 16:17:58 +08:00
let sortable_fields = [];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2021-07-07 17:42:14 +08:00
},
2021-08-19 21:02:43 +08:00
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
2021-08-19 21:02:43 +08:00
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2021-07-07 17:42:14 +08:00
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2021-07-07 17:42:14 +08:00
wtxn.commit().unwrap();
2021-08-19 21:02:43 +08:00
index.prepare_for_closing().wait();
2021-07-07 17:42:14 +08:00
},
)
});
}
2022-07-04 21:10:12 +08:00
fn reindexing_movies_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Reindexing movies with default settings", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
let searchable_fields = ["title", "overview"];
let filterable_fields = ["release_date", "genres"];
2022-07-04 21:10:12 +08:00
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-07-04 21:10:12 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2022-06-16 16:17:58 +08:00
fn deleting_movies_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2022-06-16 16:32:58 +08:00
group.bench_function("-movies-delete-facetedString-facetedNumber-searchable-", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = ["title", "overview"];
let filterable_fields = ["release_date", "genres"];
2022-06-16 16:17:58 +08:00
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
2022-06-16 16:17:58 +08:00
// We index only one half of the dataset in the setup part
// as we don't care about the time it takes.
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
2022-06-16 16:17:58 +08:00
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-06-16 16:17:58 +08:00
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-06-16 16:17:58 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
let count = 1250;
let batch_size = 250;
let document_ids_to_delete =
choose_document_ids_from_index_batched(&index, count, batch_size);
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
2023-11-06 18:56:46 +08:00
delete_documents_from_ids(index, document_ids_to_delete)
2022-06-16 16:17:58 +08:00
},
)
});
}
2023-11-06 18:56:46 +08:00
/// Deletes the given batches of internal document ids from `index`, then commits
/// and closes the index.
///
/// All batches are removed inside one write transaction. For each batch the
/// internal ids are first resolved to external ids, which are then handed to
/// `IndexDocuments::remove_documents`.
///
/// # Panics
///
/// Panics if any step (transaction, id resolution, removal, execution, commit)
/// fails.
fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
    let mut wtxn = index.write_txn().unwrap();
    let indexer_config = IndexerConfig::default();

    for batch in document_ids_to_delete {
        // FIXME: the whole batch of external docids (strings) is collected into a vec.
        //        Since what we have is an iterator, it would be better to delete in chunks.
        let external_ids = index
            .external_documents_ids()
            .find_external_id_of(&wtxn, batch)
            .unwrap()
            .only_external_ids()
            .collect::<std::result::Result<Vec<_>, RoaringBitmap>>()
            .unwrap();

        let indexer = IndexDocuments::new(
            &mut wtxn,
            &index,
            &indexer_config,
            IndexDocumentsConfig::default(),
            |_| (),
            || false,
        )
        .unwrap();
        // The second tuple element returned by `remove_documents` is intentionally ignored.
        let (indexer, _) = indexer.remove_documents(external_ids).unwrap();
        indexer.execute().unwrap();
    }

    wtxn.commit().unwrap();
    index.prepare_for_closing().wait();
}
2022-06-16 16:17:58 +08:00
fn indexing_movies_in_three_batches(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Indexing movies in three batches", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
let searchable_fields = ["title", "overview"];
let filterable_fields = ["release_date", "genres"];
2022-06-16 16:17:58 +08:00
let sortable_fields = [];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
2022-06-16 16:17:58 +08:00
let mut wtxn = index.write_txn().unwrap();
// We index only one half of the dataset in the setup part
2022-02-23 00:39:24 +08:00
// as we don't care about the time it takes.
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2022-05-02 23:00:03 +08:00
fn indexing_nested_movies_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2022-05-02 23:00:03 +08:00
group.bench_function("Indexing nested movies with default settings", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
2022-05-02 23:00:03 +08:00
let searchable_fields = [
"title",
"overview",
"provider_names",
"genres",
"crew.name",
"cast.character",
"cast.name",
2022-06-16 16:17:58 +08:00
];
2022-05-02 23:00:03 +08:00
let filterable_fields = [
"popularity",
"release_date",
"runtime",
"vote_average",
"external_ids",
"keywords",
"providers.buy.name",
"providers.rent.name",
"providers.flatrate.name",
"provider_names",
"genres",
"crew.name",
"cast.character",
"cast.name",
2022-06-16 16:17:58 +08:00
];
let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2022-05-02 23:00:03 +08:00
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-05-02 23:00:03 +08:00
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-05-02 23:00:03 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2022-06-16 16:17:58 +08:00
fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
2022-05-02 23:00:03 +08:00
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2022-06-16 16:32:58 +08:00
group.bench_function("-movies-delete-facetedString-facetedNumber-searchable-nested-", |b| {
2022-05-02 23:00:03 +08:00
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "id";
let searchable_fields = [
"title",
"overview",
"provider_names",
"genres",
"crew.name",
"cast.character",
"cast.name",
];
let filterable_fields = [
"popularity",
"release_date",
"runtime",
"vote_average",
"external_ids",
"keywords",
"providers.buy.name",
"providers.rent.name",
"providers.flatrate.name",
"provider_names",
"genres",
"crew.name",
"cast.character",
"cast.name",
];
let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"];
2022-05-02 23:00:03 +08:00
2022-06-16 16:17:58 +08:00
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
// We index only one half of the dataset in the setup part
// as we don't care about the time it takes.
2022-05-02 23:00:03 +08:00
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
2022-06-16 16:17:58 +08:00
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-06-16 16:17:58 +08:00
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-06-16 16:17:58 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
let count = 1250;
let batch_size = 250;
let document_ids_to_delete =
choose_document_ids_from_index_batched(&index, count, batch_size);
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
2023-11-06 18:56:46 +08:00
delete_documents_from_ids(index, document_ids_to_delete)
2022-06-16 16:17:58 +08:00
},
)
});
}
2022-05-02 23:00:03 +08:00
2022-06-16 16:17:58 +08:00
fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Indexing nested movies without any facets", |b| {
b.iter_with_setup(
move || {
let primary_key = "id";
2022-05-02 23:00:03 +08:00
let searchable_fields = [
"title",
"overview",
"provider_names",
"genres",
"crew.name",
"cast.character",
"cast.name",
2022-06-16 16:17:58 +08:00
];
let filterable_fields = [];
let sortable_fields = [];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
2022-05-02 23:00:03 +08:00
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-05-02 23:00:03 +08:00
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-05-02 23:00:03 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2021-09-14 00:08:28 +08:00
fn indexing_geo(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
2022-06-16 16:17:58 +08:00
group.sample_size(BENCHMARK_ITERATION);
2021-09-14 00:08:28 +08:00
group.bench_function("Indexing geo_point", |b| {
b.iter_with_setup(
move || {
2022-06-16 16:17:58 +08:00
let primary_key = "geonameid";
let searchable_fields = ["name", "alternatenames", "elevation"];
let filterable_fields = ["_geo", "population", "elevation"];
let sortable_fields = ["_geo", "population", "elevation"];
setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
)
},
move |index| {
let config = IndexerConfig::default();
2022-06-16 16:17:58 +08:00
let indexing_config = IndexDocumentsConfig::default();
2021-09-14 00:08:28 +08:00
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2021-09-14 00:08:28 +08:00
2022-06-16 16:17:58 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-06-16 16:17:58 +08:00
builder.execute().unwrap();
2021-09-14 00:08:28 +08:00
2022-06-16 16:17:58 +08:00
wtxn.commit().unwrap();
2021-09-14 00:08:28 +08:00
2022-06-16 16:17:58 +08:00
index.prepare_for_closing().wait();
},
)
});
}
2021-09-14 00:08:28 +08:00
2022-07-04 21:10:12 +08:00
fn reindexing_geo(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
group.bench_function("Reindexing geo_point", |b| {
b.iter_with_setup(
move || {
let primary_key = "geonameid";
let searchable_fields = ["name", "alternatenames", "elevation"];
let filterable_fields = ["_geo", "population", "elevation"];
let sortable_fields = ["_geo", "population", "elevation"];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-07-04 21:10:12 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index
},
move |index| {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
2022-07-04 21:10:12 +08:00
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
2022-07-04 21:10:12 +08:00
builder.execute().unwrap();
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
},
)
});
}
2022-06-16 16:17:58 +08:00
fn deleting_geo_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
2022-06-16 16:32:58 +08:00
group.bench_function("-geo-delete-facetedNumber-facetedGeo-searchable-", |b| {
2022-06-16 16:17:58 +08:00
b.iter_with_setup(
move || {
let primary_key = "geonameid";
let searchable_fields = ["name", "alternatenames", "elevation"];
let filterable_fields = ["_geo", "population", "elevation"];
let sortable_fields = ["_geo", "population", "elevation"];
let index = setup_index_with_settings(
2023-01-18 01:01:26 +08:00
primary_key,
2022-06-16 16:17:58 +08:00
&searchable_fields,
&filterable_fields,
&sortable_fields,
);
2021-09-14 00:08:28 +08:00
2022-06-16 16:17:58 +08:00
// We index only one half of the dataset in the setup part
// as we don't care about the time it takes.
let config = IndexerConfig::default();
2021-09-14 00:08:28 +08:00
let mut wtxn = index.write_txn().unwrap();
2022-06-16 16:17:58 +08:00
let indexing_config = IndexDocumentsConfig::default();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&config,
indexing_config,
|_| (),
|| false,
)
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
2022-06-15 20:35:19 +08:00
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2022-06-16 16:17:58 +08:00
wtxn.commit().unwrap();
let count = 1250;
let batch_size = 250;
let document_ids_to_delete =
choose_document_ids_from_index_batched(&index, count, batch_size);
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
2023-11-06 18:56:46 +08:00
delete_documents_from_ids(index, document_ids_to_delete)
2021-09-14 00:08:28 +08:00
},
)
});
}
2021-07-07 17:42:14 +08:00
// Benchmark functions registered in the `benches` criterion group; the group is then
// handed to `criterion_main!`, which generates the entry point of this benchmark binary.
criterion_group!(
    benches,
    indexing_songs_default,
    reindexing_songs_default,
    deleting_songs_in_batches_default,
    indexing_songs_without_faceted_numbers,
    indexing_songs_without_faceted_fields,
    indexing_songs_in_three_batches_default,
    indexing_wiki,
    reindexing_wiki,
    deleting_wiki_in_batches_default,
    indexing_wiki_in_three_batches,
    indexing_movies_default,
    reindexing_movies_default,
    deleting_movies_in_batches_default,
    indexing_movies_in_three_batches,
    indexing_nested_movies_default,
    deleting_nested_movies_in_batches_default,
    indexing_nested_movies_without_faceted_fields,
    indexing_geo,
    reindexing_geo,
    deleting_geo_in_batches_default
);
criterion_main!(benches);