Merge #287

287: Add benchmarks for indexing r=Kerollmops a=irevoire closes #274 I don't really know how much time this will take on our bench machine. I'm afraid the wiki dataset will take a really long time to bench (it takes 1h30 on my computer). If you are ok with it, I would like to merge this first PR since it introduces a first set of benchmarks and see how much time it takes in reality on our setup. Co-authored-by: Tamo <tamo@meilisearch.com>
2025-03-03 04:14:15 +08:00 · 2021-07-07 15:41:15 +00:00 · 2021-07-07 15:41:15 +00:00 · 16698f714b
commit 16698f714b
parent 4c9531bdf3 931021fe57
5 changed files with 336 additions and 12 deletions
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@ -4,7 +4,7 @@ on:
  workflow_dispatch:
    inputs:
      dataset_name:
-        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
        required: false
        default: 'songs'

--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@ -28,3 +28,7 @@ harness = false
 [[bench]]
 name = "wiki"
 harness = false
+
+[[bench]]
+name = "indexing"
+harness = false
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._

 ### On your machine

-To run all the benchmarks (~4h):
+To run all the benchmarks (~5h):

 ```bash
 cargo bench
 ```

-To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:

 ```bash
 cargo bench --bench <dataset name>
@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th

 ```bash
 mkdir ~/datasets
-MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```
@ -84,6 +84,7 @@ Run the comparison script:
 The benchmarks are available for the following datasets:
 - `songs`
 - `wiki`
+- `movies`

 ### Songs

@ -107,5 +108,9 @@ It was generated with the following command:
 xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
 ```

-_[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
+### Movies
+
+`movies` is a really small dataset that we use as the example in the [getting started](https://docs.meilisearch.com/learn/getting_started/) guide.
+
+_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._

--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@ -0,0 +1,314 @@
+mod datasets_paths;
+
+use std::fs::{create_dir_all, remove_dir_all, File};
+use std::path::Path;
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use heed::EnvOpenOptions;
+use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
+use milli::Index;
+
+#[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
+/// Makes sure `path` exists as a directory, removing any previous content first.
+///
+/// A directory that does not exist yet is not treated as an error.
+///
+/// # Panics
+///
+/// Panics if the removal fails for any reason other than `NotFound`, or if the
+/// directory cannot be created.
+fn setup_dir(path: impl AsRef<Path>) {
+    let dir = path.as_ref();
+    if let Err(err) = remove_dir_all(dir) {
+        // Only a missing directory is acceptable here: there is simply nothing to clean up.
+        if err.kind() != std::io::ErrorKind::NotFound {
+            panic!("{}", err);
+        }
+    }
+    create_dir_all(dir).unwrap();
+}
+
+/// Opens a brand new index for a benchmark run.
+///
+/// The `benches.mmdb` directory is reset through `setup_dir`, then an `Index` is
+/// opened in it with a 100 GB map size and at most 10 readers.
+///
+/// # Panics
+///
+/// Panics if the directory cannot be reset or if `Index::new` returns an error.
+fn setup_index() -> Index {
+    let index_dir = "benches.mmdb";
+    setup_dir(&index_dir);
+
+    let mut env_options = EnvOpenOptions::new();
+    env_options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+    env_options.max_readers(10);
+
+    Index::new(env_options, index_dir).unwrap()
+}
+
+/// Benchmarks the indexing of the songs dataset (CSV) with the complete settings:
+/// primary key, displayed fields, searchable fields and filterable fields, the latter
+/// including `released-timestamp` and `duration-float`.
+///
+/// The benchmark is registered in the `indexing` group with a sample size of 10.
+/// Every measured iteration opens the dataset, indexes it with the
+/// `ReplaceDocuments` method and commits the write transaction.
+///
+/// # Panics
+///
+/// Panics if any update fails or if the dataset file cannot be opened.
+fn indexing_songs_default(c: &mut Criterion) {
+    let index = setup_index();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    builder.set_primary_key("id".to_owned());
+    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_filterable_fields(faceted_fields);
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let index_ref = &index;
+
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing songs with default settings", |b| {
+        b.iter_with_setup(
+            // Setup, run before each measured iteration.
+            // NOTE(review): no document id is added to this deletion builder; presumably the
+            // intent is to empty the index between iterations — confirm that an id-less
+            // `delete_documents` does that, otherwise later iterations replace documents
+            // instead of indexing into an empty index.
+            move || {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
+                builder.execute().unwrap();
+                wtxn.commit().unwrap();
+            },
+            // Measured routine.
+            move |_| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Csv);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                // The panic message is only built on failure, so no string is formatted
+                // inside the timed section when the file opens successfully.
+                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|e| {
+                    panic!("could not find the dataset in: {}: {:?}", datasets_paths::SMOL_SONGS, e)
+                });
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+
+    index.prepare_for_closing().wait();
+}
+
+/// Benchmarks the indexing of the songs dataset (CSV) with only the `genre`, `country`
+/// and `artist` fields declared as filterable; `released-timestamp` and `duration-float`
+/// are left out.
+///
+/// The benchmark is registered in the `indexing` group with a sample size of 10.
+/// Every measured iteration opens the dataset, indexes it with the
+/// `ReplaceDocuments` method and commits the write transaction.
+///
+/// # Panics
+///
+/// Panics if any update fails or if the dataset file cannot be opened.
+fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
+    let index = setup_index();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    builder.set_primary_key("id".to_owned());
+    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect();
+    builder.set_filterable_fields(faceted_fields);
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let index_ref = &index;
+
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing songs without faceted numbers", |b| {
+        b.iter_with_setup(
+            // Setup, run before each measured iteration.
+            // NOTE(review): no document id is added to this deletion builder; presumably the
+            // intent is to empty the index between iterations — confirm that an id-less
+            // `delete_documents` does that.
+            move || {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
+                builder.execute().unwrap();
+                wtxn.commit().unwrap();
+            },
+            // Measured routine.
+            move |_| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Csv);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                // The panic message is only built on failure, so no string is formatted
+                // inside the timed section when the file opens successfully.
+                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|e| {
+                    panic!("could not find the dataset in: {}: {:?}", datasets_paths::SMOL_SONGS, e)
+                });
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+    index.prepare_for_closing().wait();
+}
+
+/// Benchmarks the indexing of the songs dataset (CSV) without declaring any filterable
+/// field: only the primary key, the displayed fields and the searchable fields are set.
+///
+/// The benchmark is registered in the `indexing` group with a sample size of 10.
+/// Every measured iteration opens the dataset, indexes it with the
+/// `ReplaceDocuments` method and commits the write transaction.
+///
+/// # Panics
+///
+/// Panics if any update fails or if the dataset file cannot be opened.
+fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
+    let index = setup_index();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    builder.set_primary_key("id".to_owned());
+    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let index_ref = &index;
+
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing songs without any facets", |b| {
+        b.iter_with_setup(
+            // Setup, run before each measured iteration.
+            // NOTE(review): no document id is added to this deletion builder; presumably the
+            // intent is to empty the index between iterations — confirm that an id-less
+            // `delete_documents` does that.
+            move || {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
+                builder.execute().unwrap();
+                wtxn.commit().unwrap();
+            },
+            // Measured routine.
+            move |_| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Csv);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                // The panic message is only built on failure, so no string is formatted
+                // inside the timed section when the file opens successfully.
+                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|e| {
+                    panic!("could not find the dataset in: {}: {:?}", datasets_paths::SMOL_SONGS, e)
+                });
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+    index.prepare_for_closing().wait();
+}
+
+/// Benchmarks the indexing of the wiki articles dataset (CSV) with a primary key,
+/// displayed fields (`title`, `body`, `url`) and searchable fields (`title`, `body`).
+///
+/// The benchmark is registered in the `indexing` group with a sample size of 10.
+/// Every measured iteration opens the dataset, indexes it with the
+/// `ReplaceDocuments` method and commits the write transaction.
+///
+/// # Panics
+///
+/// Panics if any update fails or if the dataset file cannot be opened.
+fn indexing_wiki(c: &mut Criterion) {
+    let index = setup_index();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    builder.set_primary_key("id".to_owned());
+    let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    // No filterable (faceted) fields are configured for this dataset.
+
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let index_ref = &index;
+
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing wiki", |b| {
+        b.iter_with_setup(
+            // Setup, run before each measured iteration.
+            // NOTE(review): no document id is added to this deletion builder; presumably the
+            // intent is to empty the index between iterations — confirm that an id-less
+            // `delete_documents` does that.
+            move || {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
+                builder.execute().unwrap();
+                wtxn.commit().unwrap();
+            },
+            // Measured routine.
+            move |_| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Csv);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                // Report the path that was actually opened (the wiki dataset, not the songs
+                // one), and only build the message when the open fails.
+                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).unwrap_or_else(|e| {
+                    panic!(
+                        "could not find the dataset in: {}: {:?}",
+                        datasets_paths::SMOL_WIKI_ARTICLES,
+                        e
+                    )
+                });
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+    index.prepare_for_closing().wait();
+}
+
+/// Benchmarks the indexing of the movies dataset (JSON) with a primary key, displayed
+/// fields, searchable fields (`title`, `overview`) and filterable fields
+/// (`release_date`, `genres`).
+///
+/// The benchmark is registered in the `indexing` group with a sample size of 10.
+/// Every measured iteration opens the dataset, indexes it with the
+/// `ReplaceDocuments` method and commits the write transaction.
+///
+/// # Panics
+///
+/// Panics if any update fails or if the dataset file cannot be opened.
+fn indexing_movies_default(c: &mut Criterion) {
+    let index = setup_index();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    builder.set_primary_key("id".to_owned());
+    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    // Same spelling as the displayed field above: the filterable field used to be written
+    // `released_date`, which does not match the `release_date` field name.
+    let faceted_fields = ["release_date", "genres"].iter().map(|s| s.to_string()).collect();
+    builder.set_filterable_fields(faceted_fields);
+
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let index_ref = &index;
+
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing movies with default settings", |b| {
+        b.iter_with_setup(
+            // Setup, run before each measured iteration.
+            // NOTE(review): no document id is added to this deletion builder; presumably the
+            // intent is to empty the index between iterations — confirm that an id-less
+            // `delete_documents` does that.
+            move || {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
+                builder.execute().unwrap();
+                wtxn.commit().unwrap();
+            },
+            // Measured routine.
+            move |_| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Json);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                // The panic message is only built on failure, so no string is formatted
+                // inside the timed section when the file opens successfully.
+                let reader = File::open(datasets_paths::MOVIES).unwrap_or_else(|e| {
+                    panic!("could not find the dataset in: {}: {:?}", datasets_paths::MOVIES, e)
+                });
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+
+    index.prepare_for_closing().wait();
+}
+
+criterion_group!(
+    benches,
+    indexing_songs_default,
+    indexing_songs_without_faceted_numbers,
+    indexing_songs_without_faceted_fields,
+    indexing_wiki,
+    indexing_movies_default
+);
+criterion_main!(benches);
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@ -10,8 +10,9 @@ use reqwest::IntoUrl;

 const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";

-const DATASET_SONGS: &str = "smol-songs";
-const DATASET_WIKI: &str = "smol-wiki-articles";
+const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
+const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
+const DATASET_MOVIES: (&str, &str) = ("movies", "json");

 /// The name of the environment variable used to select the path
 /// of the directory containing the datasets
@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> {
    )?;
    writeln!(manifest_paths_file)?;

-    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
        let out_path = out_dir.join(dataset);
-        let out_file = out_path.with_extension("csv");
+        let out_file = out_path.with_extension(extension);

        writeln!(
            &mut manifest_paths_file,
@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> {
        if out_file.exists() {
            eprintln!(
                "The dataset {} already exists on the file system and will not be downloaded again",
-                dataset
+                out_path.display(),
            );
            continue;
        }
-        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
        eprintln!("downloading: {}", url);
        let bytes = download_dataset(url.clone())?;
        eprintln!("{} downloaded successfully", url);
-        eprintln!("uncompressing in {}", out_path.display());
+        eprintln!("uncompressing in {}", out_file.display());
        uncompress_in_file(bytes, &out_file)?;
    }