diff --git a/Cargo.toml b/Cargo.toml index a9378adc4..506fd3dc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"] +members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml new file mode 100644 index 000000000..d608a49dc --- /dev/null +++ b/json-depth-checker/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "json-depth-checker" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde_json = "1.0" + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "depth" +harness = false diff --git a/json-depth-checker/benches/depth.rs b/json-depth-checker/benches/depth.rs new file mode 100644 index 000000000..e11bc1a68 --- /dev/null +++ b/json-depth-checker/benches/depth.rs @@ -0,0 +1,59 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use json_depth_checker::should_flatten_from_unchecked_slice; +use serde_json::json; + +fn criterion_benchmark(c: &mut Criterion) { + let null = serde_json::to_vec(&json!(null)).unwrap(); + let bool_true = serde_json::to_vec(&json!(true)).unwrap(); + let bool_false = serde_json::to_vec(&json!(false)).unwrap(); + let integer = serde_json::to_vec(&json!(42)).unwrap(); + let float = serde_json::to_vec(&json!(1456.258)).unwrap(); + let string = serde_json::to_vec(&json!("hello world")).unwrap(); + let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap(); + let complex_object = serde_json::to_vec(&json!({ + "doggos": [ + { "bernard": true }, + { "michel": 42 }, + false, + ], + "bouvier": true, + "caniche": null, + })) + .unwrap(); + let simple_array = serde_json::to_vec(&json!([ + 1, + 2, + 3, + "viva", + "l\"algeria", + true, + "[array]", + "escaped string \"" + ])) + .unwrap(); + let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap(); + let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap(); + + c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null))); + c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true))); + c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false))); + c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer))); + c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float))); + c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string))); + c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object))); + c.bench_function("complex object", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&complex_object)) + }); + c.bench_function("simple array", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&simple_array)) + }); + c.bench_function("array of array", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&array_of_array)) + }); + c.bench_function("array of object", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&array_of_object)) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/json-depth-checker/fuzz/Cargo.toml b/json-depth-checker/fuzz/Cargo.toml new file mode 100644 index 000000000..e36657ec2 --- /dev/null +++ b/json-depth-checker/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "json-depth-checker" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +arbitrary-json = "0.1.1" +serde_json = "1.0.79" + +[dependencies.json-depth-checker] +path = ".." + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "depth" +path = "fuzz_targets/depth.rs" +test = false +doc = false diff --git a/json-depth-checker/fuzz/fuzz_targets/depth.rs b/json-depth-checker/fuzz/fuzz_targets/depth.rs new file mode 100644 index 000000000..6c3a6efe7 --- /dev/null +++ b/json-depth-checker/fuzz/fuzz_targets/depth.rs @@ -0,0 +1,13 @@ +#![no_main] +use arbitrary_json::ArbitraryValue; +use json_depth_checker::*; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|value: ArbitraryValue| { + let value = serde_json::Value::from(value); + let left = should_flatten_from_value(&value); + let value = serde_json::to_vec(&value).unwrap(); + let right = should_flatten_from_unchecked_slice(&value); + + assert_eq!(left, right); +}); diff --git a/json-depth-checker/src/lib.rs b/json-depth-checker/src/lib.rs new file mode 100644 index 000000000..3d0f28af8 --- /dev/null +++ b/json-depth-checker/src/lib.rs @@ -0,0 +1,114 @@ +use serde_json::Value; + +/// Your json MUST BE valid and generated by `serde_json::to_vec` before being +/// sent in this function. This function is DUMB and FAST but makes a lot of +/// asumption about the way `serde_json` will generate its input. +/// +/// Will return `true` if the JSON contains an object, an array of array +/// or an array containing an object. Returns `false` for everything else. +pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool { + if json.is_empty() { + return false; + } + + // since the json we receive has been generated by serde_json we know + // it doesn't contains any whitespace at the beginning thus we can check + // directly if we're looking at an object. + if json[0] == b'{' { + return true; + } else if json[0] != b'[' { + // if the json isn't an object or an array it means it's a simple value. + return false; + } + + // The array case is a little bit more complex. We are looking for a second + // `[` but we need to ensure that it doesn't appear inside of a string. Thus + // we need to keep track of if we're in a string or not. + + // will be used when we met a `\` to skip the next character. + let mut skip_next = false; + let mut in_string = false; + + for byte in json.iter().skip(1) { + match byte { + // handle the backlash. + _ if skip_next => skip_next = false, + b'\\' => skip_next = true, + + // handle the strings. + byte if in_string => { + if *byte == b'"' { + in_string = false; + } + } + b'"' => in_string = true, + + // handle the arrays. + b'[' => return true, + // since we know the json is valid we don't need to ensure the + // array is correctly closed + + // handle the objects. + b'{' => return true, + + // ignore everything else + _ => (), + } + } + + false +} + +/// Consider using [`should_flatten_from_unchecked_slice`] when you can. +/// Will returns `true` if the json contains an object, an array of array +/// or an array containing an object. +/// Returns `false` for everything else. +/// This function has been written to test the [`should_flatten_from_unchecked_slice`]. +pub fn should_flatten_from_value(json: &Value) -> bool { + match json { + Value::Object(..) => true, + Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()), + _ => false, + } +} + +#[cfg(test)] +mod tests { + use serde_json::*; + + use super::*; + + #[test] + fn test_shouldnt_flatten() { + let shouldnt_flatten = vec![ + json!(null), + json!(true), + json!(false), + json!("a superb string"), + json!("a string escaping other \"string\""), + json!([null, true, false]), + json!(["hello", "world", "!"]), + json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]), + ]; + for value in shouldnt_flatten { + assert!(!should_flatten_from_value(&value)); + let value = serde_json::to_vec(&value).unwrap(); + assert!(!should_flatten_from_unchecked_slice(&value)); + } + } + + #[test] + fn test_should_flatten() { + let should_flatten = vec![ + json!({}), + json!({ "hello": "world" }), + json!(["hello", ["world"]]), + json!([true, true, true, true, true, true, true, true, true, {}]), + ]; + for value in should_flatten { + assert!(should_flatten_from_value(&value)); + let value = serde_json::to_vec(&value).unwrap(); + assert!(should_flatten_from_unchecked_slice(&value)); + } + } +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1295c4384..64497bc13 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } memmap2 = "0.5.3" diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index cbb6ed428..c215872ca 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> { })?; self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; - let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; - - self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, + } } else { self.new_documents_ids.insert(docid); } @@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> { if let Some(flatten) = flattened_document { self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; } else { - let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; - self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => { + self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? + } + } } progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> { } // Flatten a document from the fields ids map contained in self and insert the new - // created fields. - fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result> { + // created fields. Returns `None` if the document doesn't need to be flattened. + fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { + if obkv + .iter() + .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) + { + return Ok(None); + } + let mut doc = serde_json::Map::new(); for (k, v) in obkv.iter() { @@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> { writer.insert(fid, &value)?; } - Ok(buffer) + Ok(Some(buffer)) } // Flatten a document from a field mapping generated by [create_fields_mapping]