Fix all benchmarks and add compile-time checking of the benchmarks in the CI

2025-03-03 04:14:15 +08:00 · 2021-09-22 12:10:21 +02:00 · 2021-09-22 12:10:21 +02:00 · 176160d32f
commit 176160d32f
parent fe9f380993
5 changed files with 94 additions and 58 deletions
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@ -33,7 +33,7 @@ jobs:
      uses: actions-rs/cargo@v1
      with:
        command: check
-        args: --all
+        args: --workspace --all-targets
    - name: Run cargo test
      uses: actions-rs/cargo@v1
      with:
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@ -6,6 +6,9 @@ publish = false

 [dependencies]
 milli = { path = "../milli" }
+anyhow = "1.0"
+serde_json = { version = "1.0.62", features = ["preserve_order"] }
+csv = "1.1.6"

 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = "0.3.2"
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@ -1,11 +1,12 @@
 mod datasets_paths;
+mod utils;

-use std::fs::{create_dir_all, remove_dir_all, File};
+use std::fs::{create_dir_all, remove_dir_all};
 use std::path::Path;

 use criterion::{criterion_group, criterion_main, Criterion};
 use heed::EnvOpenOptions;
-use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
+use milli::update::UpdateBuilder;
 use milli::Index;

 #[cfg(target_os = "linux")]
@ -67,15 +68,10 @@ fn indexing_songs_default(c: &mut Criterion) {
            move |index| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index.write_txn().unwrap();
-                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+                let builder = update_builder.index_documents(&mut wtxn, &index);

-                builder.update_format(UpdateFormat::Csv);
-                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
-                    "could not find the dataset in: {}",
-                    datasets_paths::SMOL_SONGS
-                ));
-                builder.execute(reader, |_, _| ()).unwrap();
+                let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
+                builder.execute(documents, |_, _| ()).unwrap();
                wtxn.commit().unwrap();

                index.prepare_for_closing().wait();
@ -118,15 +114,10 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
            move |index| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index.write_txn().unwrap();
-                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+                let builder = update_builder.index_documents(&mut wtxn, &index);

-                builder.update_format(UpdateFormat::Csv);
-                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
-                    "could not find the dataset in: {}",
-                    datasets_paths::SMOL_SONGS
-                ));
-                builder.execute(reader, |_, _| ()).unwrap();
+                let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
+                builder.execute(documents, |_, _| ()).unwrap();
                wtxn.commit().unwrap();

                index.prepare_for_closing().wait();
@ -165,15 +156,10 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
            move |index| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index.write_txn().unwrap();
-                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+                let builder = update_builder.index_documents(&mut wtxn, &index);

-                builder.update_format(UpdateFormat::Csv);
-                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
-                    "could not find the dataset in: {}",
-                    datasets_paths::SMOL_SONGS
-                ));
-                builder.execute(reader, |_, _| ()).unwrap();
+                let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
+                builder.execute(documents, |_, _| ()).unwrap();
                wtxn.commit().unwrap();

                index.prepare_for_closing().wait();
@ -211,15 +197,10 @@ fn indexing_wiki(c: &mut Criterion) {
            move |index| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index.write_txn().unwrap();
-                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+                let builder = update_builder.index_documents(&mut wtxn, &index);

-                builder.update_format(UpdateFormat::Csv);
-                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
-                    "could not find the dataset in: {}",
-                    datasets_paths::SMOL_SONGS
-                ));
-                builder.execute(reader, |_, _| ()).unwrap();
+                let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
+                builder.execute(documents, |_, _| ()).unwrap();
                wtxn.commit().unwrap();

                index.prepare_for_closing().wait();
@ -262,13 +243,10 @@ fn indexing_movies_default(c: &mut Criterion) {
            move |index| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index.write_txn().unwrap();
-                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+                let builder = update_builder.index_documents(&mut wtxn, &index);

-                builder.update_format(UpdateFormat::Json);
-                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-                let reader = File::open(datasets_paths::MOVIES)
-                    .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
-                builder.execute(reader, |_, _| ()).unwrap();
+                let documents = utils::documents_from(datasets_paths::MOVIES, "json");
+                builder.execute(documents, |_, _| ()).unwrap();
                wtxn.commit().unwrap();

                index.prepare_for_closing().wait();
@ -316,15 +294,11 @@ fn indexing_geo(c: &mut Criterion) {
            move |index| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index.write_txn().unwrap();
-                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+                let builder = update_builder.index_documents(&mut wtxn, &index);
+
+                let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
+                builder.execute(documents, |_, _| ()).unwrap();

-                builder.update_format(UpdateFormat::JsonStream);
-                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-                let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!(
-                    "could not find the dataset in: {}",
-                    datasets_paths::SMOL_ALL_COUNTRIES
-                ));
-                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();

                index.prepare_for_closing().wait();
--- a/benchmarks/benches/search_geo.rs
+++ b/benchmarks/benches/search_geo.rs
@ -2,7 +2,7 @@ mod datasets_paths;
 mod utils;

 use criterion::{criterion_group, criterion_main};
-use milli::update::{Settings, UpdateFormat};
+use milli::update::Settings;
 use utils::Conf;

 #[cfg(target_os = "linux")]
@ -33,7 +33,7 @@ fn base_conf(builder: &mut Settings) {
 #[rustfmt::skip]
 const BASE_CONF: Conf = Conf {
    dataset: datasets_paths::SMOL_ALL_COUNTRIES,
-    dataset_format: UpdateFormat::JsonStream,
+    dataset_format: "jsonl",
    queries: &[
        "",
    ],
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@ -1,10 +1,15 @@
+#![allow(dead_code)]
+
 use std::fs::{create_dir_all, remove_dir_all, File};
+use std::io::{self, Cursor, Read, Seek};
 use std::path::Path;

 use criterion::BenchmarkId;
 use heed::EnvOpenOptions;
-use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
+use milli::documents::DocumentBatchReader;
+use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder};
 use milli::{FilterCondition, Index};
+use serde_json::{Map, Value};

 pub struct Conf<'a> {
    /// where we are going to create our database.mmdb directory
@ -13,7 +18,7 @@ pub struct Conf<'a> {
    /// the dataset to be used, it must be an uncompressed csv
    pub dataset: &'a str,
    /// The format of the dataset
-    pub dataset_format: UpdateFormat,
+    pub dataset_format: &'a str,
    pub group_name: &'a str,
    pub queries: &'a [&'a str],
    /// here you can change which criterion are used and in which order.
@ -33,7 +38,7 @@ pub struct Conf<'a> {
 impl Conf<'_> {
    pub const BASE: Self = Conf {
        database_name: "benches.mmdb",
-        dataset_format: UpdateFormat::Csv,
+        dataset_format: "csv",
        dataset: "",
        group_name: "",
        queries: &[],
@ -87,11 +92,10 @@ pub fn base_setup(conf: &Conf) -> Index {
    if let None = conf.primary_key {
        builder.enable_autogenerate_docids();
    }
-    builder.update_format(conf.dataset_format);
+    let documents = documents_from(conf.dataset, conf.dataset_format);
+
    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    let reader = File::open(conf.dataset)
-        .expect(&format!("could not find the dataset in: {}", conf.dataset));
-    builder.execute(reader, |_, _| ()).unwrap();
+    builder.execute(documents, |_, _| ()).unwrap();
    wtxn.commit().unwrap();

    index
@ -128,3 +132,58 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
        index.prepare_for_closing().wait();
    }
 }
+
+/// Opens the dataset at `filename` and converts it into a [`DocumentBatchReader`]
+/// that can be handed to the indexing builder.
+///
+/// `filetype` selects the parser and must be one of `"csv"`, `"json"` or `"jsonl"`.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened, if `filetype` is not one of the supported
+/// values, or if the dataset cannot be converted into a document batch. Panicking
+/// is deliberate: this helper is only used to set up benchmarks, where a broken
+/// dataset must abort the run.
+pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl Read + Seek> {
+    // Build the panic message lazily: `expect(&format!(..))` would allocate the
+    // message even when the file opens fine.
+    let reader = File::open(filename)
+        .unwrap_or_else(|err| panic!("could not find the dataset in: {}: {:?}", filename, err));
+    let documents = match filetype {
+        "csv" => documents_from_csv(reader),
+        "json" => documents_from_json(reader),
+        "jsonl" => documents_from_jsonl(reader),
+        otherwise => panic!("invalid update format {:?}", otherwise),
+    }
+    .unwrap_or_else(|err| {
+        panic!("could not convert the {:?} dataset in {}: {:?}", filetype, filename, err)
+    });
+    DocumentBatchReader::from_reader(Cursor::new(documents))
+        .expect("a freshly built document batch must be readable")
+}
+
+/// Serializes a stream of JSON objects (one document per value, e.g. one per line)
+/// read from `reader` into the bytes of a document batch.
+///
+/// # Errors
+///
+/// Returns an error if a value is not a valid JSON object, or if the batch builder
+/// fails to create, extend or finish the batch.
+fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    // serde_json does not buffer its input, so wrap the reader to avoid issuing
+    // many tiny reads against the underlying source.
+    let values = serde_json::Deserializer::from_reader(io::BufReader::new(reader))
+        .into_iter::<Map<String, Value>>();
+    for document in values {
+        documents.add_documents(document?)?;
+    }
+    documents.finish()?;
+
+    Ok(writer.into_inner())
+}
+
+/// Serializes a single JSON value read from `reader` into the bytes of a
+/// document batch.
+///
+/// # Errors
+///
+/// Returns an error if the input is not valid JSON, or if the batch builder fails
+/// to create, extend or finish the batch.
+fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    // serde_json does not buffer its input, so wrap the reader to avoid issuing
+    // many tiny reads against the underlying source.
+    let json: serde_json::Value = serde_json::from_reader(io::BufReader::new(reader))?;
+    documents.add_documents(json)?;
+    documents.finish()?;
+
+    Ok(writer.into_inner())
+}
+
+/// Serializes the CSV records read from `reader` into the bytes of a document
+/// batch, deserializing every record as a JSON-like map.
+///
+/// # Errors
+///
+/// Returns an error if a record cannot be deserialized, or if the batch builder
+/// fails to create, extend or finish the batch.
+fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+    let mut output = Cursor::new(Vec::new());
+    let mut batch = milli::documents::DocumentBatchBuilder::new(&mut output)?;
+
+    let mut csv_reader = csv::Reader::from_reader(reader);
+    for record in csv_reader.deserialize::<Map<String, Value>>() {
+        // Stop at the first malformed record or failed insertion.
+        batch.add_documents(record?)?;
+    }
+    batch.finish()?;
+
+    Ok(output.into_inner())
+}