From ab5247dc6413ac69f09cccb9a5a5a86e6180dea0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 21 Feb 2022 16:30:13 +0100 Subject: [PATCH 1/4] Add a new songs benchmark to test multi batch indexing --- benchmarks/benches/indexing.rs | 72 ++++++++++++++++++++++++++++++++++ benchmarks/build.rs | 15 ++++++- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index a84998b12..8536dabe8 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -83,6 +83,77 @@ fn indexing_songs_default(c: &mut Criterion) { }); } +fn indexing_songs_in_three_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs in three batches with default settings", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it take. + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -332,6 +403,7 @@ criterion_group!( indexing_songs_default, indexing_songs_without_faceted_numbers, indexing_songs_without_faceted_fields, + indexing_songs_in_three_batches_default, indexing_wiki, indexing_movies_default, indexing_geo diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 2495930bb..90ebf70af 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -11,10 +11,23 @@ use reqwest::IntoUrl; const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv"); +const DATASET_SONGS_1_2: (&str, &str) = ("smol-songs-1_2", "csv"); +const DATASET_SONGS_3_4: (&str, &str) = ("smol-songs-3_4", "csv"); +const DATASET_SONGS_4_4: (&str, &str) = ("smol-songs-4_4", "csv"); const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); +const ALL_DATASETS: &[(&str, &str)] = &[ + DATASET_SONGS, + DATASET_SONGS_1_2, + DATASET_SONGS_3_4, + DATASET_SONGS_4_4, + DATASET_WIKI, + DATASET_MOVIES, + DATASET_GEO, +]; + /// The name of the environment variable used to select the path /// of the directory containing the datasets const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; @@ -33,7 +46,7 @@ fn main() -> anyhow::Result<()> { )?; writeln!(manifest_paths_file)?; - for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] { + for (dataset, extension) in ALL_DATASETS { let out_path = out_dir.join(dataset); let out_file = out_path.with_extension(extension); From 8d2e3e4aba86d62df0faf1f08d561c17e55951c0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 21 Feb 2022 17:59:03 +0100 Subject: [PATCH 2/4] Add a new wiki benchmark to test multi batch indexing --- benchmarks/benches/indexing.rs | 71 ++++++++++++++++++++++++++++++++++ benchmarks/build.rs | 6 +++ 2 files changed, 77 insertions(+) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 8536dabe8..97eee0a34 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -294,6 +294,76 @@ fn indexing_wiki(c: &mut Criterion) { }); } +fn indexing_wiki_in_three_batches(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing wiki in three batches", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + // there is NO faceted fields at all + builder.execute(|_| ()).unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it take. + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_movies_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -405,6 +475,7 @@ criterion_group!( indexing_songs_without_faceted_fields, indexing_songs_in_three_batches_default, indexing_wiki, + indexing_wiki_in_three_batches, indexing_movies_default, indexing_geo ); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 90ebf70af..66a0a841b 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -15,6 +15,9 @@ const DATASET_SONGS_1_2: (&str, &str) = ("smol-songs-1_2", "csv"); const DATASET_SONGS_3_4: (&str, &str) = ("smol-songs-3_4", "csv"); const DATASET_SONGS_4_4: (&str, &str) = ("smol-songs-4_4", "csv"); const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); +const DATASET_WIKI_1_2: (&str, &str) = ("smol-wiki-articles-1_2", "csv"); +const DATASET_WIKI_3_4: (&str, &str) = ("smol-wiki-articles-3_4", "csv"); +const DATASET_WIKI_4_4: (&str, &str) = ("smol-wiki-articles-4_4", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); @@ -24,6 +27,9 @@ const ALL_DATASETS: &[(&str, &str)] = &[ DATASET_SONGS_3_4, DATASET_SONGS_4_4, DATASET_WIKI, + DATASET_WIKI_1_2, + DATASET_WIKI_3_4, + DATASET_WIKI_4_4, DATASET_MOVIES, DATASET_GEO, ]; From a820aa11e6f252934eca61e859029deea3013792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Feb 2022 13:47:37 +0100 Subject: [PATCH 3/4] Add a new movies benchmark to test multi batch indexing --- benchmarks/benches/indexing.rs | 72 ++++++++++++++++++++++++++++++++++ benchmarks/build.rs | 6 +++ 2 files changed, 78 insertions(+) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 97eee0a34..b9ad7cad9 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -413,6 +413,77 @@ fn indexing_movies_default(c: &mut Criterion) { }); } +fn indexing_movies_in_three_batches(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing movies in three batches", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "overview"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + + builder.execute(|_| ()).unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it take. + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_geo(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -477,6 +548,7 @@ criterion_group!( indexing_wiki, indexing_wiki_in_three_batches, indexing_movies_default, + indexing_movies_in_three_batches, indexing_geo ); criterion_main!(benches); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 66a0a841b..906230fd4 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -19,6 +19,9 @@ const DATASET_WIKI_1_2: (&str, &str) = ("smol-wiki-articles-1_2", "csv"); const DATASET_WIKI_3_4: (&str, &str) = ("smol-wiki-articles-3_4", "csv"); const DATASET_WIKI_4_4: (&str, &str) = ("smol-wiki-articles-4_4", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); +const DATASET_MOVIES_1_2: (&str, &str) = ("movies-1_2", "json"); +const DATASET_MOVIES_3_4: (&str, &str) = ("movies-3_4", "json"); +const DATASET_MOVIES_4_4: (&str, &str) = ("movies-4_4", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); const ALL_DATASETS: &[(&str, &str)] = &[ @@ -31,6 +34,9 @@ const ALL_DATASETS: &[(&str, &str)] = &[ DATASET_WIKI_3_4, DATASET_WIKI_4_4, DATASET_MOVIES, + DATASET_MOVIES_1_2, + DATASET_MOVIES_3_4, + DATASET_MOVIES_4_4, DATASET_GEO, ]; From acfc96525cc975dd1ad932a80a1e6d2df23bc9f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Feb 2022 17:39:24 +0100 Subject: [PATCH 4/4] Apply GitHub suggestions --- benchmarks/benches/indexing.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index b9ad7cad9..ee74f2a80 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -116,7 +116,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { builder.execute(|_| ()).unwrap(); // We index only one half of the dataset in the setup part - // as we don't care about the time it take. + // as we don't care about the time it takes. let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = @@ -318,7 +318,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { builder.execute(|_| ()).unwrap(); // We index only one half of the dataset in the setup part - // as we don't care about the time it take. + // as we don't care about the time it takes. let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; @@ -443,7 +443,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { builder.execute(|_| ()).unwrap(); // We index only one half of the dataset in the setup part - // as we don't care about the time it take. + // as we don't care about the time it takes. let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder =