From d97af4d8e6823c67f743b37c2ffdb65deb319445 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 16:04:23 +0100 Subject: [PATCH] fix field order of JSON documents --- Cargo.lock | 2 ++ crates/meilisearch-types/Cargo.toml | 3 ++- crates/meilisearch-types/src/document_formats.rs | 10 +++++++++- crates/milli/src/update/new/indexer/mod.rs | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b0e5978b5..c3222c7fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3538,6 +3538,7 @@ version = "1.11.0" dependencies = [ "actix-web", "anyhow", + "bumpalo", "convert_case 0.6.0", "csv", "deserr", @@ -3550,6 +3551,7 @@ dependencies = [ "meili-snap", "memmap2", "milli", + "raw-collections", "roaring", "serde", "serde-cs", diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index 0dae024f2..3bd368e7c 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] actix-web = { version = "4.8.0", default-features = false } anyhow = "1.0.86" +bumpalo = "3.16.0" convert_case = "0.6.0" csv = "1.3.0" deserr = { version = "0.6.2", features = ["actix-web"] } @@ -23,6 +24,7 @@ flate2 = "1.0.30" fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } +raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } roaring = { version = "0.10.6", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" @@ -70,4 +72,3 @@ swedish-recomposition = ["milli/swedish-recomposition"] german = ["milli/german"] # allow turkish normalization turkish = ["milli/turkish"] - diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index b40c4d0b6..db893f880 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -3,13 +3,16 @@ use 
std::fs::File; use std::io::{self, BufWriter}; use std::marker::PhantomData; +use bumpalo::Bump; use memmap2::Mmap; use milli::documents::Error; use milli::update::new::TopLevelMap; use milli::Object; +use raw_collections::RawMap; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; +use serde_json::value::RawValue; use serde_json::{to_writer, Map, Value}; use crate::error::{Code, ErrorCode}; @@ -213,10 +216,15 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { // We memory map to be able to deserailize into a TopLevelMap<'pl> that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; + let mut doc_alloc = Bump::with_capacity(1024 * 1024); // 1MiB, reset and reused for each document let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); - let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) { + let count = match array_each(&mut deserializer, |obj: &RawValue| { + doc_alloc.reset(); + let map = RawMap::from_raw_value(obj, &doc_alloc)?; + to_writer(&mut out, &map) + }) { // The json data has been deserialized and does not need to be processed again. // The data has been transferred to the writer during the deserialization process. Ok(Ok(count)) => count, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 70ac7f959..3b66c2ec0 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -198,7 +198,7 @@ where document_extractor_data.docids_delta.apply_to(document_ids); } - field_distribution.retain(|_, v| *v == 0); + field_distribution.retain(|_, v| *v != 0); const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; let current_num_threads = rayon::current_num_threads();