uses an env variable to find the datasets

tamo 2021-04-27 15:41:16 +02:00 committed by Tamo
parent 4969abeaab
commit 3c84075d2d
2 changed files with 27 additions and 2 deletions


@@ -13,3 +13,15 @@ You can run the following command from the root of this git repository
```
wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
```
- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h
- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`; it should take around 1h
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h

By default, the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the `MILLI_BENCH_DATASETS_PATH` environment variable, like this:
```
MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
```
Our benchmarking suite uses criterion, which allows a lot of configuration; see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html).
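For example (this is not part of this commit, and the filter string below is only illustrative), criterion's harness accepts extra arguments after `--`, which let you run a subset of the benchmarks by name or save a baseline and compare later runs against it:

```
# run only the benchmarks whose name contains the given filter string
cargo bench --bench songs -- "basic"
# save a baseline named `main`, then compare a later run against it
cargo bench --bench songs -- --save-baseline main
cargo bench --bench songs -- --baseline main
```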


@@ -7,6 +7,15 @@ use milli::{
    FacetCondition, Index,
};

/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";

/// The default path for the datasets if nothing is specified
/// By default we chose `milli/benches` because any cargo command run in `milli/milli/**` will be
/// executed with a pwd of `milli/milli`
const DEFAULT_DATASETS_PATH: &str = "milli/benches";

pub struct Conf<'a> {
    /// where we are going to create our database.mmdb directory
    /// each benchmark will first try to delete it and then recreate it
@@ -78,7 +87,10 @@ pub fn base_setup(conf: &Conf) -> Index {
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
// when called from cargo, the current directory is supposed to be milli/milli
let base_dataset_path = std::env::vars()
    .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
    .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
let reader = File::open(&dataset_path)
    .expect(&format!("could not find the dataset in: {}", &dataset_path));
builder.execute(reader, |_, _| ()).unwrap();
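Side note, not part of the patch: the same lookup could be written with `std::env::var`, which reads a single variable directly instead of scanning the whole environment with `std::env::vars()`. A minimal sketch, assuming the two constants introduced above and a hypothetical helper name:

```
// Sketch only: equivalent lookup via std::env::var, reusing the constants
// BASE_DATASETS_PATH_KEY and DEFAULT_DATASETS_PATH from this commit.
fn base_dataset_path() -> String {
    // var() errors when the variable is unset (or not valid Unicode),
    // in which case we fall back to the default path.
    std::env::var(BASE_DATASETS_PATH_KEY)
        .unwrap_or_else(|_| DEFAULT_DATASETS_PATH.to_owned())
}
```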
@@ -100,7 +112,8 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
let mut search = index.search(&rtxn);
search.query(query).optional_words(conf.optional_words);
if let Some(facet_condition) = conf.facet_condition {
    let facet_condition =
        FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
    search.facet_condition(facet_condition);
}
let _ids = search.execute().unwrap();