From 5e683ba472e00421594658a2a5c90c73f26b8514 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Sep 2021 18:08:28 +0200 Subject: [PATCH 1/2] add benchmarks for the geosearch --- .github/workflows/benchmarks.yml | 2 +- benchmarks/Cargo.toml | 4 + benchmarks/README.md | 32 ++++++-- benchmarks/benches/indexing.rs | 59 ++++++++++++++- benchmarks/benches/search_geo.rs | 123 +++++++++++++++++++++++++++++++ benchmarks/benches/utils.rs | 11 ++- benchmarks/build.rs | 3 +- 7 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 benchmarks/benches/search_geo.rs diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index c64c6a64b..7a9fbb5de 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: dataset_name: - description: 'The name of the dataset used to benchmark (search_songs, search_wiki or indexing)' + description: 'The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)' required: false default: 'search_songs' diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 9e380b9a8..b598f2f6f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -29,6 +29,10 @@ harness = false name = "search_wiki" harness = false +[[bench]] +name = "search_geo" +harness = false + [[bench]] name = "indexing" harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md index 16838e488..7a387dfdd 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -36,7 +36,7 @@ To run all the benchmarks (~5h): cargo bench ``` -To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark: +To run only the `search_songs` (~1h), `search_wiki` (~3h), `search_geo` (~20m) or `indexing` (~2h) benchmark: ```bash cargo bench --bench @@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th ```bash mkdir ~/datasets -MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench 
--bench songs # the three datasets are downloaded +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the four datasets are downloaded touch build.rs MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded ``` @@ -81,14 +81,15 @@ Run the comparison script: ## Datasets -The benchmarks are available for the following datasets: -- `songs` -- `wiki` +The benchmarks use the following datasets: +- `smol-songs` +- `smol-wiki` - `movies` +- `smol-all-countries` ### Songs -`songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz). +`smol-songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz). It was generated with this command: @@ -96,11 +97,11 @@ It was generated with this command: xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv ``` -_[Download the generated `songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._ +_[Download the generated `smol-songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._ ### Wiki -`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz). +`smol-wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz). 
It was generated with the following command: @@ -108,9 +109,24 @@ It was generated with the following command: xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv ``` +_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki.csv.gz)._ + ### Movies `movies` is a really small dataset we uses as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/) _[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._ + +### All Countries + +`smol-all-countries` is a subset of the [`all-countries.csv` dataset]() +It has been converted to jsonlines and then edited so it matches our format for the `_geo` field. + +It was generated with the following command: +```bash +bat all-countries.csv.gz | gunzip | xsv sample --seed 42 1000000 | csv2json-lite | sd '"latitude":"(.*?)","longitude":"(.*?)"' '"_geo": { "lat": $1, "lng": $2 }' | sd '\[|\]|,$' '' | gzip > smol-all-countries.jsonl.gz +``` + +_[Download the `smol-all-countries` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-all-countries.jsonl.gz)._ + diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index bd056ea23..30532aef8 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -277,12 +277,69 @@ fn indexing_movies_default(c: &mut Criterion) { }); } +fn indexing_geo(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing geo_point", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("geonameid".to_owned()); + let displayed_fields = + ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); 
+ builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_sortable_fields(sortable_fields); + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + index + }, + move |index| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); + + builder.update_format(UpdateFormat::JsonStream); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_ALL_COUNTRIES + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + criterion_group!( benches, indexing_songs_default, indexing_songs_without_faceted_numbers, indexing_songs_without_faceted_fields, indexing_wiki, - indexing_movies_default + indexing_movies_default, + indexing_geo ); criterion_main!(benches); diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs new file mode 100644 index 000000000..1432f691b --- /dev/null +++ b/benchmarks/benches/search_geo.rs @@ -0,0 +1,123 @@ +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::{Settings, UpdateFormat}; +use utils::Conf; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = + ["geonameid", 
"name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_sortable_fields(sortable_fields); +} + +#[rustfmt::skip] +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_ALL_COUNTRIES, + dataset_format: UpdateFormat::JsonStream, + queries: &[ + "", + ], + configure: base_conf, + primary_key: Some("geonameid"), + ..Conf::BASE +}; + +fn bench_geo(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + // A basic placeholder with no geo + utils::Conf { + group_name: "placeholder with no geo", + ..BASE_CONF + }, + // Medium aglomeration: probably the most common usecase + utils::Conf { + group_name: "asc sort from Lille", + sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):asc"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc sort from Lille", + sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):desc"]), + ..BASE_CONF + }, + // Big agglomeration: a lot of documents close to our point + utils::Conf { + group_name: "asc sort from Tokyo", + sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):asc"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc sort from Tokyo", + sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):desc"]), + ..BASE_CONF + }, + // The furthest point from any civilization + utils::Conf { + group_name: "asc sort from Point Nemo", + sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):asc"]), + ..BASE_CONF + }, 
+ utils::Conf { + group_name: "desc sort from Point Nemo", + sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):desc"]), + ..BASE_CONF + }, + // Filters + utils::Conf { + group_name: "filter of 100km from Lille", + filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 100000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 1km from Lille", + filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 1000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 100km from Tokyo", + filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 100000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 1km from Tokyo", + filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 1000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 100km from Point Nemo", + filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 100000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 1km from Point Nemo", + filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 1000)"), + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_geo); +criterion_main!(benches); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 5318527f4..72eac59d9 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -12,6 +12,8 @@ pub struct Conf<'a> { pub database_name: &'a str, /// the dataset to be used, it must be an uncompressed csv pub dataset: &'a str, + /// The format of the dataset + pub dataset_format: UpdateFormat, pub group_name: &'a str, pub queries: &'a [&'a str], /// here you can change which criterion are used and in which order. 
@@ -21,6 +23,7 @@ pub struct Conf<'a> { /// the last chance to configure your database as you want pub configure: fn(&mut Settings), pub filter: Option<&'a str>, + pub sort: Option<Vec<&'a str>>, /// enable or disable the optional words on the query pub optional_words: bool, /// primary key, if there is None we'll auto-generate docids for every documents @@ -30,12 +33,14 @@ pub struct Conf<'a> { impl Conf<'_> { pub const BASE: Self = Conf { database_name: "benches.mmdb", + dataset_format: UpdateFormat::Csv, dataset: "", group_name: "", queries: &[], criterion: None, configure: |_| (), filter: None, + sort: None, optional_words: true, primary_key: None, }; @@ -82,7 +87,7 @@ pub fn base_setup(conf: &Conf) -> Index { if let None = conf.primary_key { builder.enable_autogenerate_docids(); } - builder.update_format(UpdateFormat::Csv); + builder.update_format(conf.dataset_format); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); let reader = File::open(conf.dataset) .expect(&format!("could not find the dataset in: {}", conf.dataset)); @@ -110,6 +115,10 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap(); search.filter(filter); } + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + search.sort_criteria(sort); + } let _ids = search.execute().unwrap(); }); }); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 47a14f25b..2495930bb 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -13,6 +13,7 @@ const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/dat const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv"); const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); +const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); /// The name of the environment variable used to select the path /// of the directory 
containing the datasets @@ -32,7 +33,7 @@ fn main() -> anyhow::Result<()> { )?; writeln!(manifest_paths_file)?; - for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] { + for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] { let out_path = out_dir.join(dataset); let out_file = out_path.with_extension(extension); From 9a920d1f93ccf7afb698285289b6b56bdd54d900 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 20 Sep 2021 10:37:38 +0200 Subject: [PATCH 2/2] Fix datasets links in the readme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Urquizar --- benchmarks/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 7a387dfdd..a6fdf9360 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -109,7 +109,7 @@ It was generated with the following command: xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv ``` -_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki.csv.gz)._ +_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._ ### Movies @@ -120,7 +120,7 @@ _[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._ ### All Countries -`smol-all-countries` is a subset of the [`all-countries.csv` dataset]() +`smol-all-countries` is a subset of the [`all-countries.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/all-countries.csv.gz) It has been converted to jsonlines and then edited so it matches our format for the `_geo` field. It was generated with the following command: