Merge pull request #494 from meilisearch/flatten-what-is-needed

Only flatten the required objects
2024-11-30 00:55:00 +08:00 · 2022-04-14 11:43:28 +02:00 · 2022-04-14 11:43:28 +02:00 · d362278a41
commit d362278a41
parent 7791ef90e7 00f78d6b5a
8 changed files with 251 additions and 9 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [workspace]
 resolver = "2"
-members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
+members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"]
 default-members = ["milli"]

 [profile.dev]
--- a/json-depth-checker/Cargo.toml
+++ b/json-depth-checker/Cargo.toml
@ -0,0 +1,16 @@
+[package]
+name = "json-depth-checker"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde_json = "1.0"
+
+[dev-dependencies]
+criterion = "0.3"
+
+[[bench]]
+name = "depth"
+harness = false
--- a/json-depth-checker/benches/depth.rs
+++ b/json-depth-checker/benches/depth.rs
@ -0,0 +1,59 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use json_depth_checker::should_flatten_from_unchecked_slice;
+use serde_json::json;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let null = serde_json::to_vec(&json!(null)).unwrap();
+    let bool_true = serde_json::to_vec(&json!(true)).unwrap();
+    let bool_false = serde_json::to_vec(&json!(false)).unwrap();
+    let integer = serde_json::to_vec(&json!(42)).unwrap();
+    let float = serde_json::to_vec(&json!(1456.258)).unwrap();
+    let string = serde_json::to_vec(&json!("hello world")).unwrap();
+    let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap();
+    let complex_object = serde_json::to_vec(&json!({
+        "doggos": [
+            { "bernard": true },
+            { "michel": 42 },
+            false,
+        ],
+        "bouvier": true,
+        "caniche": null,
+    }))
+    .unwrap();
+    let simple_array = serde_json::to_vec(&json!([
+        1,
+        2,
+        3,
+        "viva",
+        "l\"algeria",
+        true,
+        "[array]",
+        "escaped string \""
+    ]))
+    .unwrap();
+    let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap();
+    let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap();
+
+    c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null)));
+    c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true)));
+    c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false)));
+    c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer)));
+    c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float)));
+    c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string)));
+    c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object)));
+    c.bench_function("complex object", |b| {
+        b.iter(|| should_flatten_from_unchecked_slice(&complex_object))
+    });
+    c.bench_function("simple array", |b| {
+        b.iter(|| should_flatten_from_unchecked_slice(&simple_array))
+    });
+    c.bench_function("array of array", |b| {
+        b.iter(|| should_flatten_from_unchecked_slice(&array_of_array))
+    });
+    c.bench_function("array of object", |b| {
+        b.iter(|| should_flatten_from_unchecked_slice(&array_of_object))
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
--- a/json-depth-checker/fuzz/Cargo.toml
+++ b/json-depth-checker/fuzz/Cargo.toml
@ -0,0 +1,27 @@
+[package]
+name = "json-depth-checker"
+version = "0.0.0"
+authors = ["Automatically generated"]
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+arbitrary-json = "0.1.1"
+serde_json = "1.0.79"
+
+[dependencies.json-depth-checker]
+path = ".."
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "depth"
+path = "fuzz_targets/depth.rs"
+test = false
+doc = false
--- a/json-depth-checker/fuzz/fuzz_targets/depth.rs
+++ b/json-depth-checker/fuzz/fuzz_targets/depth.rs
@ -0,0 +1,13 @@
+#![no_main]
+use arbitrary_json::ArbitraryValue;
+use json_depth_checker::*;
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|value: ArbitraryValue| {
+    let value = serde_json::Value::from(value);
+    let left = should_flatten_from_value(&value);
+    let value = serde_json::to_vec(&value).unwrap();
+    let right = should_flatten_from_unchecked_slice(&value);
+
+    assert_eq!(left, right);
+});
--- a/json-depth-checker/src/lib.rs
+++ b/json-depth-checker/src/lib.rs
@ -0,0 +1,114 @@
+use serde_json::Value;
+
+/// Your json MUST BE valid and generated by `serde_json::to_vec` before being
+/// sent in this function. This function is DUMB and FAST but makes a lot of
+/// asumption about the way `serde_json` will generate its input.
+///
+/// Will return `true` if the JSON contains an object, an array of array
+/// or an array containing an object. Returns `false` for everything else.
+pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool {
+    if json.is_empty() {
+        return false;
+    }
+
+    // since the json we receive has been generated by serde_json we know
+    // it doesn't contains any whitespace at the beginning thus we can check
+    // directly if we're looking at an object.
+    if json[0] == b'{' {
+        return true;
+    } else if json[0] != b'[' {
+        // if the json isn't an object or an array it means it's a simple value.
+        return false;
+    }
+
+    // The array case is a little bit more complex. We are looking for a second
+    // `[` but we need to ensure that it doesn't appear inside of a string. Thus
+    // we need to keep track of if we're in a string or not.
+
+    // will be used when we met a `\` to skip the next character.
+    let mut skip_next = false;
+    let mut in_string = false;
+
+    for byte in json.iter().skip(1) {
+        match byte {
+            // handle the backlash.
+            _ if skip_next => skip_next = false,
+            b'\\' => skip_next = true,
+
+            // handle the strings.
+            byte if in_string => {
+                if *byte == b'"' {
+                    in_string = false;
+                }
+            }
+            b'"' => in_string = true,
+
+            // handle the arrays.
+            b'[' => return true,
+            // since we know the json is valid we don't need to ensure the
+            // array is correctly closed
+
+            // handle the objects.
+            b'{' => return true,
+
+            // ignore everything else
+            _ => (),
+        }
+    }
+
+    false
+}
+
+/// Consider using [`should_flatten_from_unchecked_slice`] when you can.
+/// Will returns `true` if the json contains an object, an array of array
+/// or an array containing an object.
+/// Returns `false` for everything else.
+/// This function has been written to test the [`should_flatten_from_unchecked_slice`].
+pub fn should_flatten_from_value(json: &Value) -> bool {
+    match json {
+        Value::Object(..) => true,
+        Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()),
+        _ => false,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use serde_json::*;
+
+    use super::*;
+
+    #[test]
+    fn test_shouldnt_flatten() {
+        let shouldnt_flatten = vec![
+            json!(null),
+            json!(true),
+            json!(false),
+            json!("a superb string"),
+            json!("a string escaping other \"string\""),
+            json!([null, true, false]),
+            json!(["hello", "world", "!"]),
+            json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]),
+        ];
+        for value in shouldnt_flatten {
+            assert!(!should_flatten_from_value(&value));
+            let value = serde_json::to_vec(&value).unwrap();
+            assert!(!should_flatten_from_unchecked_slice(&value));
+        }
+    }
+
+    #[test]
+    fn test_should_flatten() {
+        let should_flatten = vec![
+            json!({}),
+            json!({ "hello": "world" }),
+            json!(["hello", ["world"]]),
+            json!([true, true, true, true, true, true, true, true, true, {}]),
+        ];
+        for value in should_flatten {
+            assert!(should_flatten_from_value(&value));
+            let value = serde_json::to_vec(&value).unwrap();
+            assert!(should_flatten_from_unchecked_slice(&value));
+        }
+    }
+}
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
 grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
 geoutils = "0.4.1"
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
+json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
 memmap2 = "0.5.3"
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> {
                    })?;

                self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
-                let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?;
-
-                self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
+                match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
+                    Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
+                    None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
+                }
            } else {
                self.new_documents_ids.insert(docid);
            }
@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> {
            if let Some(flatten) = flattened_document {
                self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
            } else {
-                let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?;
-                self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
+                match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
+                    Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
+                    None => {
+                        self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
+                    }
+                }
            }

            progress_callback(UpdateIndexingStep::RemapDocumentAddition {
@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> {
    }

    // Flatten a document from the fields ids map contained in self and insert the new
-    // created fields.
-    fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> {
+    // created fields. Returns `None` if the document doesn't need to be flattened.
+    fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
+        if obkv
+            .iter()
+            .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
+        {
+            return Ok(None);
+        }
+
        let mut doc = serde_json::Map::new();

        for (k, v) in obkv.iter() {
@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> {
            writer.insert(fid, &value)?;
        }

-        Ok(buffer)
+        Ok(Some(buffer))
    }

    // Flatten a document from a field mapping generated by [create_fields_mapping]