From bab898ce8607468dd6ec17f095b8cdbd8f1133a0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 7 Apr 2022 18:20:44 +0200 Subject: [PATCH] move the flatten-serde-json crate inside of milli --- Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 15 + flatten-serde-json/README.md | 153 ++++++++++ flatten-serde-json/fuzz/Cargo.toml | 26 ++ .../fuzz/fuzz_targets/flatten.rs | 8 + flatten-serde-json/src/lib.rs | 264 ++++++++++++++++++ flatten-serde-json/src/main.rs | 11 + milli/Cargo.toml | 2 +- 8 files changed, 479 insertions(+), 2 deletions(-) create mode 100644 flatten-serde-json/Cargo.toml create mode 100644 flatten-serde-json/README.md create mode 100644 flatten-serde-json/fuzz/Cargo.toml create mode 100644 flatten-serde-json/fuzz/fuzz_targets/flatten.rs create mode 100644 flatten-serde-json/src/lib.rs create mode 100644 flatten-serde-json/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 3f2732444..a9378adc4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["milli", "filter-parser", "http-ui", "benchmarks", "infos", "helpers", "cli"] +members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml new file mode 100644 index 000000000..db92c1ded --- /dev/null +++ b/flatten-serde-json/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "flatten-serde-json" +version = "0.1.0" +edition = "2021" +description = "Flatten serde-json objects like elastic search" +readme = "README.md" +author = ["Tamo tamo@meilisearch.com"] +repository = "https://github.com/irevoire/flatten-serde-json" +keywords = ["json", "flatten"] +categories = ["command-line-utilities"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde_json = "1.0" diff --git a/flatten-serde-json/README.md b/flatten-serde-json/README.md new file 
mode 100644 index 000000000..a1dd7d275 --- /dev/null +++ b/flatten-serde-json/README.md @@ -0,0 +1,153 @@ +# Flatten serde Json + +This crate flattens [`serde_json`](https://docs.rs/serde_json/latest/serde_json/) `Object`s into a format +similar to [elastic search](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html). + +## Examples + +### There is nothing to do + +```json +{ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] +} +``` + +Flattens to: +```json +{ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] +} +``` + +------------ + +### Objects + +```json +{ + "a": { + "b": "c", + "d": "e", + "f": "g" + } +} +``` + +Flattens to: +```json +{ + "a.b": "c", + "a.d": "e", + "a.f": "g" +} +``` + +------------ + +### Array of objects + +```json +{ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" } + ] +} +``` + +Flattens to: +```json +{ + "a.b": ["c", "d", "e"] +} +``` + +------------ + +### Array of objects with normal value in the array + +```json +{ + "a": [ + 42, + { "b": "c" }, + { "b": "d" }, + { "b": "e" } + ] +} +``` + +Flattens to: +```json +{ + "a": 42, + "a.b": ["c", "d", "e"] +} +``` + +------------ + +### Array of objects of array of objects of ...
+ +```json +{ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] } + ], + ["l"], + "m" + ] +} +``` + +Flattens to: +```json +{ + "a": ["b", "c", "d", "l", "m"], + "a.e": ["f", "g", "j"], + "a.h": "i", + "a.e.z": "y" +} +``` + +------------ + +### Collision between a generated field name and an already existing field + +```json +{ + "a": { + "b": "c" + }, + "a.b": "d" +} +``` + +Flattens to: +```json +{ + "a.b": ["c", "d"] +} +``` + diff --git a/flatten-serde-json/fuzz/Cargo.toml b/flatten-serde-json/fuzz/Cargo.toml new file mode 100644 index 000000000..2e0510d5f --- /dev/null +++ b/flatten-serde-json/fuzz/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "flatten_serde_json-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +arbitrary-json = "0.1.1" + +[dependencies.flatten_serde_json] +path = ".." + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "flatten" +path = "fuzz_targets/flatten.rs" +test = false +doc = false diff --git a/flatten-serde-json/fuzz/fuzz_targets/flatten.rs b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs new file mode 100644 index 000000000..399d1c484 --- /dev/null +++ b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs @@ -0,0 +1,8 @@ +#![no_main] +use arbitrary_json::ArbitraryObject; +use flatten_serde_json::flatten; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|object: ArbitraryObject| { + let _ = flatten(&object); +}); diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs new file mode 100644 index 000000000..734ae2a24 --- /dev/null +++ b/flatten-serde-json/src/lib.rs @@ -0,0 +1,264 @@ +#![doc = include_str!("../README.md")] + +use serde_json::{json, Map, Value}; + +pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> { + let mut obj = Map::new(); + insert_object(&mut obj, None, json); + obj +} + +fn
insert_object( + base_json: &mut Map<String, Value>, + base_key: Option<&str>, + object: &Map<String, Value>, +) { + for (key, value) in object { + let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); + + if let Some(array) = value.as_array() { + insert_array(base_json, &new_key, array); + } else if let Some(object) = value.as_object() { + insert_object(base_json, Some(&new_key), object); + } else { + insert_value(base_json, &new_key, value.clone()); + } + } +} + +fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) { + for value in array { + if let Some(object) = value.as_object() { + insert_object(base_json, Some(base_key), object); + } else if let Some(sub_array) = value.as_array() { + insert_array(base_json, base_key, sub_array); + } else { + insert_value(base_json, base_key, value.clone()); + } + } +} + +fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) { + debug_assert!(!to_insert.is_object()); + debug_assert!(!to_insert.is_array()); + + // does the field already exist?
+ if let Some(value) = base_json.get_mut(key) { + // is it already an array + if let Some(array) = value.as_array_mut() { + array.push(to_insert); + // or is there a collision + } else { + let value = std::mem::take(value); + base_json[key] = json!([value, to_insert]); + } + // if it does not exist we can push the value untouched + } else { + base_json.insert(key.to_string(), json!(to_insert)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn no_flattening() { + let mut base: Value = json!({ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!( + "got:\n{}\nexpected:\n{}\n", + serde_json::to_string_pretty(&flat).unwrap(), + serde_json::to_string_pretty(&json).unwrap() + ); + + assert_eq!(flat, json); + } + + #[test] + fn flatten_object() { + let mut base: Value = json!({ + "a": { + "b": "c", + "d": "e", + "f": "g" + } + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": "c", + "a.d": "e", + "a.f": "g" + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_array() { + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + + // here we must keep 42 in "a" + let mut base: Value = json!({ + "a": [ + 42, + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": 42, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn collision_with_object() { + let mut base: Value = json!({ + "a": { + "b": "c", 
+ }, + "a.b": "d", + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d"], + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn collision_with_array() { + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d", "c": "e" }, + [35], + ], + "a.b": "f", + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d", "f"], + "a.c": "e", + "a": 35, + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_nested_arrays() { + let mut base: Value = json!({ + "a": [ + ["b", "c"], + { "d": "e" }, + ["f", "g"], + [ + { "h": "i" }, + { "d": "j" }, + ], + ["k", "l"], + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": ["b", "c", "f", "g", "k", "l"], + "a.d": ["e", "j"], + "a.h": "i", + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_nested_arrays_and_objects() { + let mut base: Value = json!({ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!("{}", serde_json::to_string_pretty(&flat).unwrap()); + + assert_eq!( + &flat, + json!({ + "a": ["b", "c", "d", "l", "m"], + "a.e": ["f", "g", "j"], + "a.h": "i", + "a.e.z": "y", + }) + .as_object() + .unwrap() + ); + } +} diff --git a/flatten-serde-json/src/main.rs b/flatten-serde-json/src/main.rs new file mode 100644 index 000000000..dabb386f1 --- /dev/null +++ b/flatten-serde-json/src/main.rs @@ -0,0 +1,11 @@ +use std::io::stdin; + +use flatten_serde_json::flatten; +use serde_json::{Map, Value}; + +fn main() { + let json: Map = serde_json::from_reader(stdin()).unwrap(); + + let result = flatten(&json); + println!("{}", 
serde_json::to_string_pretty(&result).unwrap()); +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index e8723dc6a..a83cfd6f2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -14,7 +14,7 @@ crossbeam-channel = "0.5.2" either = "1.6.1" fst = "0.4.7" fxhash = "0.2.1" -flatten-serde-json = "0.1.0" +flatten-serde-json = { path = "../flatten-serde-json" } grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }