From e1053989c016f66b14f26997d0ff44b0331007b4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 12 Jan 2022 17:57:54 +0100 Subject: [PATCH 1/4] add a fuzzer on milli --- milli/README.md | 26 ++++++++++ milli/fuzz/.gitignore | 2 + milli/fuzz/Cargo.toml | 36 ++++++++++++++ milli/fuzz/fuzz_targets/indexing.rs | 76 +++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+) create mode 100644 milli/README.md create mode 100644 milli/fuzz/.gitignore create mode 100644 milli/fuzz/Cargo.toml create mode 100644 milli/fuzz/fuzz_targets/indexing.rs diff --git a/milli/README.md b/milli/README.md new file mode 100644 index 000000000..7479eff45 --- /dev/null +++ b/milli/README.md @@ -0,0 +1,26 @@ +# Milli + +## Fuzzing milli + +Currently you can only fuzz the indexation. +To execute the fuzzer run: +``` +cargo fuzz run indexing +``` + +To execute the fuzzer on multiple thread you can also run: +``` +cargo fuzz run -j4 indexing +``` + +Since the fuzzer is going to create a lot of temporary file to let milli index its documents +I would also recommend executing it on a ramdisk. 
+Here is how to set up a ramdisk on Linux: +``` +sudo mount -t tmpfs none path/to/your/ramdisk +``` +And then set the [TMPDIR](https://doc.rust-lang.org/std/env/fn.temp_dir.html) environment variable +to make the fuzzer create its files in it: +``` +export TMPDIR=path/to/your/ramdisk +``` diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore new file mode 100644 index 000000000..cb73742e4 --- /dev/null +++ b/milli/fuzz/.gitignore @@ -0,0 +1,2 @@ +/corpus/ +/artifacts/ diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml new file mode 100644 index 000000000..04b329600 --- /dev/null +++ b/milli/fuzz/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "milli-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +serde_json = { version = "1.0.62", features = ["preserve_order"] } +anyhow = "1.0" +tempfile = "3.3" +arbitrary-json = { path = "../../../arbitrary-json" } + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" + +[dependencies.milli] +path = ".." 
+ +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[profile.release] +debug = true + +[[bin]] +name = "indexing" +path = "fuzz_targets/indexing.rs" +test = false +doc = false diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs new file mode 100644 index 000000000..179ccf757 --- /dev/null +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -0,0 +1,76 @@ +#![no_main] + +use std::io::{BufWriter, Cursor, Read, Seek, Write}; + +use anyhow::{bail, Result}; +use arbitrary_json::ArbitraryValue; +use heed::EnvOpenOptions; +use libfuzzer_sys::fuzz_target; +use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::update::UpdateBuilder; +use milli::Index; +use serde_json::Value; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +/// Reads JSON from `input` and writes an obkv batch to `writer`. +pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { + let writer = BufWriter::new(writer); + let mut builder = DocumentBatchBuilder::new(writer)?; + builder.extend_from_json(input)?; + + if builder.len() == 0 { + bail!("Empty payload"); + } + + let count = builder.finish()?; + + Ok(count) +} + +fn index_documents( + index: &mut milli::Index, + documents: DocumentBatchReader>>, +) -> Result<()> { + let update_builder = UpdateBuilder::new(); + let mut wtxn = index.write_txn()?; + let builder = update_builder.index_documents(&mut wtxn, &index); + + builder.execute(documents, |_| ())?; + wtxn.commit()?; + Ok(()) +} + +fn create_index() -> Result { + let dir = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(1); + Ok(Index::new(options, dir.path())?) 
+} + +fuzz_target!(|batches: Vec>| { + if let Ok(mut index) = create_index() { + for batch in batches { + let documents: Vec = + batch.into_iter().map(|value| serde_json::Value::from(value)).collect(); + let json = Value::Array(documents); + let json = serde_json::to_string(&json).unwrap(); + + let mut documents = Cursor::new(Vec::new()); + + // We ignore all badly generated documents + if let Ok(_count) = read_json(json.as_bytes(), &mut documents) { + let documents = DocumentBatchReader::from_reader(documents).unwrap(); + match index_documents(&mut index, documents) { + // Err(e @ InternalError(_) | e @ IoError(_)) => panic!("{:?}", e), + _ => (), + } + } + } + + index.prepare_for_closing().wait(); + } +}); From c94952e25d33b3750da86aa00bede852133a1915 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 12 Jan 2022 18:30:11 +0100 Subject: [PATCH 2/4] update the readme + dependencies --- milli/README.md | 4 ++-- milli/fuzz/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/README.md b/milli/README.md index 7479eff45..56db42a86 100644 --- a/milli/README.md +++ b/milli/README.md @@ -5,12 +5,12 @@ Currently you can only fuzz the indexation. 
To execute the fuzzer run: ``` -cargo fuzz run indexing +cargo +nightly fuzz run indexing ``` To execute the fuzzer on multiple thread you can also run: ``` -cargo fuzz run -j4 indexing +cargo +nightly fuzz run -j4 indexing ``` Since the fuzzer is going to create a lot of temporary file to let milli index its documents diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml index 04b329600..3386ddaf9 100644 --- a/milli/fuzz/Cargo.toml +++ b/milli/fuzz/Cargo.toml @@ -14,7 +14,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } serde_json = { version = "1.0.62", features = ["preserve_order"] } anyhow = "1.0" tempfile = "3.3" -arbitrary-json = { path = "../../../arbitrary-json" } +arbitrary-json = { git = "https://github.com/irevoire/arbitrary-json" } [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" From b22c80106f6766f413e16f1576a0c532b7c120ce Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 13 Jan 2022 15:35:24 +0100 Subject: [PATCH 3/4] add some settings to the fuzzed milli and use the published version of arbitrary json --- milli/fuzz/Cargo.toml | 3 ++- milli/fuzz/fuzz_targets/indexing.rs | 36 ++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml index 3386ddaf9..0456e7098 100644 --- a/milli/fuzz/Cargo.toml +++ b/milli/fuzz/Cargo.toml @@ -9,12 +9,13 @@ edition = "2018" cargo-fuzz = true [dependencies] +arbitrary = "1.0" libfuzzer-sys = "0.4" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } serde_json = { version = "1.0.62", features = ["preserve_order"] } anyhow = "1.0" tempfile = "3.3" -arbitrary-json = { git = "https://github.com/irevoire/arbitrary-json" } +arbitrary-json = "0.1.0" [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 179ccf757..9b63983fb 100644 --- 
a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -1,5 +1,6 @@ #![no_main] +use std::collections::HashSet; use std::io::{BufWriter, Cursor, Read, Seek, Write}; use anyhow::{bail, Result}; @@ -46,9 +47,38 @@ fn index_documents( fn create_index() -> Result { let dir = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.map_size(10 * 1024 * 1024 * 1024); // 10 GB options.max_readers(1); - Ok(Index::new(options, dir.path())?) + let index = Index::new(options, dir.path())?; + + let update_builder = UpdateBuilder::new(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + let displayed_fields = + ["id", "title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields: HashSet = + ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields.clone()); + builder.set_sortable_fields(faceted_fields); + + builder.set_distinct_field("same".to_string()); + + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + Ok(index) } fuzz_target!(|batches: Vec>| { @@ -63,9 +93,9 @@ fuzz_target!(|batches: Vec>| { // We ignore all badly generated documents if let Ok(_count) = read_json(json.as_bytes(), &mut documents) { + documents.rewind().unwrap(); let documents = DocumentBatchReader::from_reader(documents).unwrap(); match index_documents(&mut index, documents) { - // Err(e @ InternalError(_) | e @ IoError(_)) => panic!("{:?}", e), _ => (), } } From 0605c0ac682993233e98b70cdd553564abee7ebd Mon Sep 17 00:00:00 2001 From: Tamo 
Date: Thu, 13 Jan 2022 18:51:08 +0100 Subject: [PATCH 4/4] apply review comments --- milli/fuzz/fuzz_targets/indexing.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 9b63983fb..327df09d1 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -91,13 +91,13 @@ fuzz_target!(|batches: Vec>| { let mut documents = Cursor::new(Vec::new()); - // We ignore all badly generated documents - if let Ok(_count) = read_json(json.as_bytes(), &mut documents) { + // We ignore all malformed documents + if let Ok(_) = read_json(json.as_bytes(), &mut documents) { documents.rewind().unwrap(); let documents = DocumentBatchReader::from_reader(documents).unwrap(); - match index_documents(&mut index, documents) { - _ => (), - } + // A lot of errors can come out of milli and we don't know which ones are normal or not + // so we are only going to look for the unexpected panics. + let _ = index_documents(&mut index, documents); } }