From 87ec95f7a03a31ff4aad0959592d7b732de71d54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 30 Dec 2018 13:04:02 +0100
Subject: [PATCH 1/2] test: Add benchmarks to measure the database

---
 Cargo.toml               |   2 +
 src/database/database.rs | 329 +++++++++++++++++++++++++++++++++++++++
 src/lib.rs               |   2 +
 3 files changed, 333 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index e15fbb6cf..5d19efca3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,6 +42,8 @@ nightly = []
 csv = "1.0"
 elapsed = "0.1"
 quickcheck = "0.7"
+rand = "0.6"
+rand_xorshift = "0.1"
 structopt = "0.2"
 tempfile = "3.0"
 termcolor = "1.0"
diff --git a/src/database/database.rs b/src/database/database.rs
index 467a1b49a..ec77a62dc 100644
--- a/src/database/database.rs
+++ b/src/database/database.rs
@@ -318,3 +318,332 @@ mod tests {
         Ok(dir.close()?)
     }
 }
+
+// Benchmarks measuring how long it takes to open and to query a database.
+// `#[bench]` needs the unstable `test` crate, hence the `nightly` feature gate.
+#[cfg(all(feature = "nightly", test))]
+mod bench {
+    extern crate test;
+
+    use super::*;
+    use std::error::Error;
+    use std::iter::repeat_with;
+    use self::test::Bencher;
+
+    use rand::distributions::Alphanumeric;
+    use rand_xorshift::XorShiftRng;
+    use rand::{Rng, SeedableRng};
+    use rand::seq::SliceRandom;
+    use serde_derive::Serialize;
+
+    use crate::tokenizer::DefaultBuilder;
+    use crate::database::update::UpdateBuilder;
+    use crate::database::schema::*;
+
+    /// Generates `number` pseudo-random alphanumeric words of 1 to 11 characters,
+    /// separated by `","` or `", "` and terminated by `"."`, `"?"`, `"!"` or `"..."`.
+    /// Returns an empty string when `number` is 0.
+    fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
+        let mut words = String::new();
+
+        for i in 0..number {
+            let word_len = rng.gen_range(1, 12);
+            let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
+            words.extend(iter);
+
+            if i == number - 1 { // last word
+                let final_ = [".", "?", "!", "..."].choose(rng).cloned();
+                words.extend(final_);
+            } else {
+                let middle = [",", ", "].choose(rng).cloned();
+                words.extend(middle);
+            }
+        }
+
+        words
+    }
+
+    /// Measures `Database::open` on a database fed with 300 generated documents.
+    #[bench]
+    fn open_little_database(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let dir = tempfile::tempdir()?;
+
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("title", STORED | INDEXED);
+        builder.new_attribute("description", STORED | INDEXED);
+        let schema = builder.build();
+
+        let db_path = dir.path().join("bench.mdb");
+        let database = Database::create(db_path.clone(), schema.clone())?;
+
+        #[derive(Serialize)]
+        struct Document {
+            id: u64,
+            title: String,
+            description: String,
+        }
+
+        let path = dir.path().join("update-000.sst");
+        let tokenizer_builder = DefaultBuilder;
+        let mut builder = UpdateBuilder::new(path, schema.clone());
+        let mut rng = XorShiftRng::seed_from_u64(42);
+
+        for i in 0..300 {
+            let document = Document {
+                id: i,
+                title: random_sentences(rng.gen_range(1, 8), &mut rng),
+                description: random_sentences(rng.gen_range(20, 200), &mut rng),
+            };
+            builder.update_document(&document, &tokenizer_builder)?;
+        }
+
+        let update = builder.build()?;
+        database.ingest_update_file(update)?;
+
+        drop(database);
+
+        bench.iter(|| {
+            let database = Database::open(db_path.clone()).unwrap();
+            test::black_box(database);
+        });
+
+        Ok(())
+    }
+
+    /// Measures `Database::open` on a database fed with 3000 generated documents.
+    #[bench]
+    fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let dir = tempfile::tempdir()?;
+
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("title", STORED | INDEXED);
+        builder.new_attribute("description", STORED | INDEXED);
+        let schema = builder.build();
+
+        let db_path = dir.path().join("bench.mdb");
+        let database = Database::create(db_path.clone(), schema.clone())?;
+
+        #[derive(Serialize)]
+        struct Document {
+            id: u64,
+            title: String,
+            description: String,
+        }
+
+        let path = dir.path().join("update-000.sst");
+        let tokenizer_builder = DefaultBuilder;
+        let mut builder = UpdateBuilder::new(path, schema.clone());
+        let mut rng = XorShiftRng::seed_from_u64(42);
+
+        for i in 0..3000 {
+            let document = Document {
+                id: i,
+                title: random_sentences(rng.gen_range(1, 8), &mut rng),
+                description: random_sentences(rng.gen_range(20, 200), &mut rng),
+            };
+            builder.update_document(&document, &tokenizer_builder)?;
+        }
+
+        let update = builder.build()?;
+        database.ingest_update_file(update)?;
+
+        drop(database);
+
+        bench.iter(|| {
+            let database = Database::open(db_path.clone()).unwrap();
+            test::black_box(database);
+        });
+
+        Ok(())
+    }
+
+    /// Measures `Database::open` on a database fed with 30_000 generated documents (ignored by default).
+    #[bench]
+    #[ignore]
+    fn open_big_database(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let dir = tempfile::tempdir()?;
+
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("title", STORED | INDEXED);
+        builder.new_attribute("description", STORED | INDEXED);
+        let schema = builder.build();
+
+        let db_path = dir.path().join("bench.mdb");
+        let database = Database::create(db_path.clone(), schema.clone())?;
+
+        #[derive(Serialize)]
+        struct Document {
+            id: u64,
+            title: String,
+            description: String,
+        }
+
+        let path = dir.path().join("update-000.sst");
+        let tokenizer_builder = DefaultBuilder;
+        let mut builder = UpdateBuilder::new(path, schema.clone());
+        let mut rng = XorShiftRng::seed_from_u64(42);
+
+        for i in 0..30_000 {
+            let document = Document {
+                id: i,
+                title: random_sentences(rng.gen_range(1, 8), &mut rng),
+                description: random_sentences(rng.gen_range(20, 200), &mut rng),
+            };
+            builder.update_document(&document, &tokenizer_builder)?;
+        }
+
+        let update = builder.build()?;
+        database.ingest_update_file(update)?;
+
+        drop(database);
+
+        bench.iter(|| {
+            let database = Database::open(db_path.clone()).unwrap();
+            test::black_box(database);
+        });
+
+        Ok(())
+    }
+
+    /// Measures the one-letter queries "a" to "e" (range `0..20` each) on 300 generated documents.
+    #[bench]
+    fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let dir = tempfile::tempdir()?;
+
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("title", STORED | INDEXED);
+        builder.new_attribute("description", STORED | INDEXED);
+        let schema = builder.build();
+
+        let db_path = dir.path().join("bench.mdb");
+        let database = Database::create(db_path.clone(), schema.clone())?;
+
+        #[derive(Serialize)]
+        struct Document {
+            id: u64,
+            title: String,
+            description: String,
+        }
+
+        let path = dir.path().join("update-000.sst");
+        let tokenizer_builder = DefaultBuilder;
+        let mut builder = UpdateBuilder::new(path, schema.clone());
+        let mut rng = XorShiftRng::seed_from_u64(42);
+
+        for i in 0..300 {
+            let document = Document {
+                id: i,
+                title: random_sentences(rng.gen_range(1, 8), &mut rng),
+                description: random_sentences(rng.gen_range(20, 200), &mut rng),
+            };
+            builder.update_document(&document, &tokenizer_builder)?;
+        }
+
+        let update = builder.build()?;
+        let view = database.ingest_update_file(update)?;
+
+        bench.iter(|| {
+            for q in &["a", "b", "c", "d", "e"] {
+                let documents = view.query_builder().unwrap().query(q, 0..20);
+                test::black_box(documents);
+            }
+        });
+
+        Ok(())
+    }
+
+    /// Measures the one-letter queries "a" to "e" (range `0..20` each) on 3000 generated documents.
+    #[bench]
+    fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let dir = tempfile::tempdir()?;
+
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("title", STORED | INDEXED);
+        builder.new_attribute("description", STORED | INDEXED);
+        let schema = builder.build();
+
+        let db_path = dir.path().join("bench.mdb");
+        let database = Database::create(db_path.clone(), schema.clone())?;
+
+        #[derive(Serialize)]
+        struct Document {
+            id: u64,
+            title: String,
+            description: String,
+        }
+
+        let path = dir.path().join("update-000.sst");
+        let tokenizer_builder = DefaultBuilder;
+        let mut builder = UpdateBuilder::new(path, schema.clone());
+        let mut rng = XorShiftRng::seed_from_u64(42);
+
+        for i in 0..3000 {
+            let document = Document {
+                id: i,
+                title: random_sentences(rng.gen_range(1, 8), &mut rng),
+                description: random_sentences(rng.gen_range(20, 200), &mut rng),
+            };
+            builder.update_document(&document, &tokenizer_builder)?;
+        }
+
+        let update = builder.build()?;
+        let view = database.ingest_update_file(update)?;
+
+        bench.iter(|| {
+            for q in &["a", "b", "c", "d", "e"] {
+                let documents = view.query_builder().unwrap().query(q, 0..20);
+                test::black_box(documents);
+            }
+        });
+
+        Ok(())
+    }
+
+    /// Measures the one-letter queries "a" to "e" (range `0..20` each) on 30_000 generated documents (ignored by default).
+    #[bench]
+    #[ignore]
+    fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let dir = tempfile::tempdir()?;
+
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("title", STORED | INDEXED);
+        builder.new_attribute("description", STORED | INDEXED);
+        let schema = builder.build();
+
+        let db_path = dir.path().join("bench.mdb");
+        let database = Database::create(db_path.clone(), schema.clone())?;
+
+        #[derive(Serialize)]
+        struct Document {
+            id: u64,
+            title: String,
+            description: String,
+        }
+
+        let path = dir.path().join("update-000.sst");
+        let tokenizer_builder = DefaultBuilder;
+        let mut builder = UpdateBuilder::new(path, schema.clone());
+        let mut rng = XorShiftRng::seed_from_u64(42);
+
+        for i in 0..30_000 {
+            let document = Document {
+                id: i,
+                title: random_sentences(rng.gen_range(1, 8), &mut rng),
+                description: random_sentences(rng.gen_range(20, 200), &mut rng),
+            };
+            builder.update_document(&document, &tokenizer_builder)?;
+        }
+
+        let update = builder.build()?;
+        let view = database.ingest_update_file(update)?;
+
+        bench.iter(|| {
+            for q in &["a", "b", "c", "d", "e"] {
+                let documents = view.query_builder().unwrap().query(q, 0..20);
+                test::black_box(documents);
+            }
+        });
+
+        Ok(())
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index ab291afa5..01b9cb85d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
+#![cfg_attr(feature = "nightly", feature(test))]
+
 pub mod automaton;
 pub mod database;
 pub mod data;

From dfa19582a24cd3b87ec145833694e9fce07f4856 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 30 Dec 2018 13:04:22 +0100
Subject: [PATCH 2/2] test: Add benchmarks to measure the words proximity criterion

---
 src/rank/criterion/words_proximity.rs | 39 +++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/rank/criterion/words_proximity.rs
b/src/rank/criterion/words_proximity.rs
index f4b3aa0cd..fc80dfaec 100644
--- a/src/rank/criterion/words_proximity.rs
+++ b/src/rank/criterion/words_proximity.rs
@@ -121,3 +121,42 @@ mod tests {
         assert_eq!(matches_proximity(matches), 3);
     }
 }
+
+#[cfg(all(feature = "nightly", test))] // `#[bench]` relies on the unstable `test` crate
+mod bench {
+    extern crate test;
+
+    use super::*;
+    use std::error::Error;
+    use self::test::Bencher;
+
+    use rand_xorshift::XorShiftRng;
+    use rand::{Rng, SeedableRng};
+
+    use crate::Attribute;
+
+    #[bench]
+    fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<dyn Error>> {
+        let number_matches = 30_000; // number of synthetic matches fed to `matches_proximity`
+        let mut matches = Vec::with_capacity(number_matches);
+        let mut rng = XorShiftRng::seed_from_u64(42); // fixed seed: identical input on every run
+
+        for _ in 0..number_matches {
+            let query_index = rng.gen_range(0, 4);
+
+            let attribute = rng.gen_range(0, 5);
+            let word_index = rng.gen_range(0, 15);
+            let attribute = Attribute::new_faillible(attribute, word_index);
+
+            let match_ = Match { query_index, attribute, ..Match::zero() };
+            matches.push(match_);
+        }
+
+        bench.iter(|| {
+            let proximity = matches_proximity(&matches);
+            test::black_box(proximity) // pass the value itself, not a closure around it
+        });
+
+        Ok(())
+    }
+}