From 931021fe57f78a0204acd0bb594e300c829bbedc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 7 Jul 2021 11:42:14 +0200 Subject: [PATCH] add benchmarks for indexing --- .github/workflows/benchmarks.yml | 2 +- benchmarks/Cargo.toml | 4 + benchmarks/README.md | 13 +- benchmarks/benches/indexing.rs | 314 +++++++++++++++++++++++++++++++ benchmarks/build.rs | 15 +- 5 files changed, 336 insertions(+), 12 deletions(-) create mode 100644 benchmarks/benches/indexing.rs diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e110c6be5..553f7e424 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: dataset_name: - description: 'The name of the dataset used to benchmark (songs or wiki)' + description: 'The name of the dataset used to benchmark (songs, wiki or indexing)' required: false default: 'songs' diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b5ba9bf4f..dd319b4e6 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -28,3 +28,7 @@ harness = false [[bench]] name = "wiki" harness = false + +[[bench]] +name = "indexing" +harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md index 843ea9b29..16838e488 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._ ### On your machine -To run all the benchmarks (~4h): +To run all the benchmarks (~5h): ```bash cargo bench ``` -To run only the `songs` (~1h) or `wiki` (~3h) benchmark: +To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark: ```bash cargo bench --bench <dataset name> @@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th ```bash mkdir ~/datasets -MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are
downloaded touch build.rs MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded ``` @@ -84,6 +84,7 @@ Run the comparison script: The benchmarks are available for the following datasets: - `songs` - `wiki` +- `movies` ### Songs @@ -107,5 +108,9 @@ It was generated with the following command: xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv ``` -_[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._ +### Movies + +`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/). + +_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._ diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs new file mode 100644 index 000000000..902b34dc8 --- /dev/null +++ b/benchmarks/benches/indexing.rs @@ -0,0 +1,314 @@ +mod datasets_paths; + +use std::fs::{create_dir_all, remove_dir_all, File}; +use std::path::Path; + +use criterion::{criterion_group, criterion_main, Criterion}; +use heed::EnvOpenOptions; +use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; +use milli::Index; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +fn setup_dir(path: impl AsRef<Path>) { + match remove_dir_all(path.as_ref()) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } + create_dir_all(path).unwrap(); +} + +fn setup_index() -> Index { + let path = "benches.mmdb"; + setup_dir(&path); + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(10); + Index::new(options, path).unwrap() +} + +fn indexing_songs_default(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let
mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs with default settings", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_SONGS + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + + index.prepare_for_closing().wait(); +} + +fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = 
update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs without faceted numbers", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_SONGS + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + index.prepare_for_closing().wait(); +} + +fn indexing_songs_without_faceted_fields(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = 
["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs without any facets", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_SONGS + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + index.prepare_for_closing().wait(); +} + +fn indexing_wiki(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + // there are NO faceted fields at
all + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing wiki", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_WIKI_ARTICLES + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + index.prepare_for_closing().wait(); +} + +fn indexing_movies_default(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["release_date", "genres"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); +
group.sample_size(10); + group.bench_function("Indexing movies with default settings", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Json); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::MOVIES) + .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES)); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + + index.prepare_for_closing().wait(); +} + +criterion_group!( + benches, + indexing_songs_default, + indexing_songs_without_faceted_numbers, + indexing_songs_without_faceted_fields, + indexing_wiki, + indexing_movies_default +); +criterion_main!(benches); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 827c2c2a3..47a14f25b 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -10,8 +10,9 @@ use reqwest::IntoUrl; const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; -const DATASET_SONGS: &str = "smol-songs"; -const DATASET_WIKI: &str = "smol-wiki-articles"; +const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv"); +const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); +const DATASET_MOVIES: (&str, &str) = ("movies", "json"); /// The name of the environment variable used to select the path /// of the directory containing the datasets @@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> { )?; writeln!(manifest_paths_file)?; - for dataset in &[DATASET_SONGS, DATASET_WIKI] { + for (dataset, extension) in [DATASET_SONGS, 
DATASET_WIKI, DATASET_MOVIES] { let out_path = out_dir.join(dataset); - let out_file = out_path.with_extension("csv"); + let out_file = out_path.with_extension(extension); writeln!( &mut manifest_paths_file, @@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> { if out_file.exists() { eprintln!( "The dataset {} already exists on the file system and will not be downloaded again", - dataset + out_file.display(), ); continue; } - let url = format!("{}/{}.csv.gz", BASE_URL, dataset); + let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension); eprintln!("downloading: {}", url); let bytes = download_dataset(url.clone())?; eprintln!("{} downloaded successfully", url); - eprintln!("uncompressing in {}", out_path.display()); + eprintln!("uncompressing in {}", out_file.display()); uncompress_in_file(bytes, &out_file)?; }