fix field order of JSON documents

This commit is contained in:
Louis Dureuil 2024-11-07 16:04:23 +01:00
parent 1f5d801271
commit d97af4d8e6
No known key found for this signature in database
4 changed files with 14 additions and 3 deletions

2
Cargo.lock generated
View File

@ -3538,6 +3538,7 @@ version = "1.11.0"
dependencies = [ dependencies = [
"actix-web", "actix-web",
"anyhow", "anyhow",
"bumpalo",
"convert_case 0.6.0", "convert_case 0.6.0",
"csv", "csv",
"deserr", "deserr",
@ -3550,6 +3551,7 @@ dependencies = [
"meili-snap", "meili-snap",
"memmap2", "memmap2",
"milli", "milli",
"raw-collections",
"roaring", "roaring",
"serde", "serde",
"serde-cs", "serde-cs",

View File

@ -13,6 +13,7 @@ license.workspace = true
[dependencies] [dependencies]
actix-web = { version = "4.8.0", default-features = false } actix-web = { version = "4.8.0", default-features = false }
anyhow = "1.0.86" anyhow = "1.0.86"
bumpalo = "3.16.0"
convert_case = "0.6.0" convert_case = "0.6.0"
csv = "1.3.0" csv = "1.3.0"
deserr = { version = "0.6.2", features = ["actix-web"] } deserr = { version = "0.6.2", features = ["actix-web"] }
@ -23,6 +24,7 @@ flate2 = "1.0.30"
fst = "0.4.7" fst = "0.4.7"
memmap2 = "0.9.4" memmap2 = "0.9.4"
milli = { path = "../milli" } milli = { path = "../milli" }
raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
roaring = { version = "0.10.6", features = ["serde"] } roaring = { version = "0.10.6", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] } serde = { version = "1.0.204", features = ["derive"] }
serde-cs = "0.2.4" serde-cs = "0.2.4"
@ -70,4 +72,3 @@ swedish-recomposition = ["milli/swedish-recomposition"]
german = ["milli/german"] german = ["milli/german"]
# allow turkish normalization # allow turkish normalization
turkish = ["milli/turkish"] turkish = ["milli/turkish"]

View File

@ -3,13 +3,16 @@ use std::fs::File;
use std::io::{self, BufWriter}; use std::io::{self, BufWriter};
use std::marker::PhantomData; use std::marker::PhantomData;
use bumpalo::Bump;
use memmap2::Mmap; use memmap2::Mmap;
use milli::documents::Error; use milli::documents::Error;
use milli::update::new::TopLevelMap; use milli::update::new::TopLevelMap;
use milli::Object; use milli::Object;
use raw_collections::RawMap;
use serde::de::{SeqAccess, Visitor}; use serde::de::{SeqAccess, Visitor};
use serde::{Deserialize, Deserializer}; use serde::{Deserialize, Deserializer};
use serde_json::error::Category; use serde_json::error::Category;
use serde_json::value::RawValue;
use serde_json::{to_writer, Map, Value}; use serde_json::{to_writer, Map, Value};
use crate::error::{Code, ErrorCode}; use crate::error::{Code, ErrorCode};
@ -213,10 +216,15 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
// We memory map to be able to deserialize into a TopLevelMap<'pl> that // We memory map to be able to deserialize into a TopLevelMap<'pl> that
// does not allocate when possible and only materializes the first/top level. // does not allocate when possible and only materializes the first/top level.
let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
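let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1 GiB as written (1024^3 bytes) — NOTE(review): originally labelled "1MiB"; confirm the intended capacity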
let mut out = BufWriter::new(output); let mut out = BufWriter::new(output);
let mut deserializer = serde_json::Deserializer::from_slice(&input); let mut deserializer = serde_json::Deserializer::from_slice(&input);
let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) { let count = match array_each(&mut deserializer, |obj: &RawValue| {
doc_alloc.reset();
let map = RawMap::from_raw_value(obj, &doc_alloc)?;
to_writer(&mut out, &map)
}) {
// The json data has been deserialized and does not need to be processed again. // The json data has been deserialized and does not need to be processed again.
// The data has been transferred to the writer during the deserialization process. // The data has been transferred to the writer during the deserialization process.
Ok(Ok(count)) => count, Ok(Ok(count)) => count,

View File

@ -198,7 +198,7 @@ where
document_extractor_data.docids_delta.apply_to(document_ids); document_extractor_data.docids_delta.apply_to(document_ids);
} }
field_distribution.retain(|_, v| *v == 0); field_distribution.retain(|_, v| *v != 0);
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
let current_num_threads = rayon::current_num_threads(); let current_num_threads = rayon::current_num_threads();