mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
fix field order of JSON documents
This commit is contained in:
parent
1f5d801271
commit
d97af4d8e6
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -3538,6 +3538,7 @@ version = "1.11.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-web",
|
"actix-web",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"bumpalo",
|
||||||
"convert_case 0.6.0",
|
"convert_case 0.6.0",
|
||||||
"csv",
|
"csv",
|
||||||
"deserr",
|
"deserr",
|
||||||
@ -3550,6 +3551,7 @@ dependencies = [
|
|||||||
"meili-snap",
|
"meili-snap",
|
||||||
"memmap2",
|
"memmap2",
|
||||||
"milli",
|
"milli",
|
||||||
|
"raw-collections",
|
||||||
"roaring",
|
"roaring",
|
||||||
"serde",
|
"serde",
|
||||||
"serde-cs",
|
"serde-cs",
|
||||||
|
@ -13,6 +13,7 @@ license.workspace = true
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
actix-web = { version = "4.8.0", default-features = false }
|
actix-web = { version = "4.8.0", default-features = false }
|
||||||
anyhow = "1.0.86"
|
anyhow = "1.0.86"
|
||||||
|
bumpalo = "3.16.0"
|
||||||
convert_case = "0.6.0"
|
convert_case = "0.6.0"
|
||||||
csv = "1.3.0"
|
csv = "1.3.0"
|
||||||
deserr = { version = "0.6.2", features = ["actix-web"] }
|
deserr = { version = "0.6.2", features = ["actix-web"] }
|
||||||
@ -23,6 +24,7 @@ flate2 = "1.0.30"
|
|||||||
fst = "0.4.7"
|
fst = "0.4.7"
|
||||||
memmap2 = "0.9.4"
|
memmap2 = "0.9.4"
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
|
raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
|
||||||
roaring = { version = "0.10.6", features = ["serde"] }
|
roaring = { version = "0.10.6", features = ["serde"] }
|
||||||
serde = { version = "1.0.204", features = ["derive"] }
|
serde = { version = "1.0.204", features = ["derive"] }
|
||||||
serde-cs = "0.2.4"
|
serde-cs = "0.2.4"
|
||||||
@ -70,4 +72,3 @@ swedish-recomposition = ["milli/swedish-recomposition"]
|
|||||||
german = ["milli/german"]
|
german = ["milli/german"]
|
||||||
# allow turkish normalization
|
# allow turkish normalization
|
||||||
turkish = ["milli/turkish"]
|
turkish = ["milli/turkish"]
|
||||||
|
|
||||||
|
@ -3,13 +3,16 @@ use std::fs::File;
|
|||||||
use std::io::{self, BufWriter};
|
use std::io::{self, BufWriter};
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use milli::documents::Error;
|
use milli::documents::Error;
|
||||||
use milli::update::new::TopLevelMap;
|
use milli::update::new::TopLevelMap;
|
||||||
use milli::Object;
|
use milli::Object;
|
||||||
|
use raw_collections::RawMap;
|
||||||
use serde::de::{SeqAccess, Visitor};
|
use serde::de::{SeqAccess, Visitor};
|
||||||
use serde::{Deserialize, Deserializer};
|
use serde::{Deserialize, Deserializer};
|
||||||
use serde_json::error::Category;
|
use serde_json::error::Category;
|
||||||
|
use serde_json::value::RawValue;
|
||||||
use serde_json::{to_writer, Map, Value};
|
use serde_json::{to_writer, Map, Value};
|
||||||
|
|
||||||
use crate::error::{Code, ErrorCode};
|
use crate::error::{Code, ErrorCode};
|
||||||
@ -213,10 +216,15 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
|
|||||||
// We memory map to be able to deserialize into a TopLevelMap<'pl> that
|
// We memory map to be able to deserialize into a TopLevelMap<'pl> that
|
||||||
// does not allocate when possible and only materialize the first/top level.
|
// does not allocate when possible and only materialize the first/top level.
|
||||||
let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
|
let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
|
||||||
|
let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // NOTE(review): 1024^3 bytes is 1 GiB, not 1 MiB — confirm the intended capacity
|
||||||
|
|
||||||
let mut out = BufWriter::new(output);
|
let mut out = BufWriter::new(output);
|
||||||
let mut deserializer = serde_json::Deserializer::from_slice(&input);
|
let mut deserializer = serde_json::Deserializer::from_slice(&input);
|
||||||
let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) {
|
let count = match array_each(&mut deserializer, |obj: &RawValue| {
|
||||||
|
doc_alloc.reset();
|
||||||
|
let map = RawMap::from_raw_value(obj, &doc_alloc)?;
|
||||||
|
to_writer(&mut out, &map)
|
||||||
|
}) {
|
||||||
// The json data has been deserialized and does not need to be processed again.
|
// The json data has been deserialized and does not need to be processed again.
|
||||||
// The data has been transferred to the writer during the deserialization process.
|
// The data has been transferred to the writer during the deserialization process.
|
||||||
Ok(Ok(count)) => count,
|
Ok(Ok(count)) => count,
|
||||||
|
@ -198,7 +198,7 @@ where
|
|||||||
document_extractor_data.docids_delta.apply_to(document_ids);
|
document_extractor_data.docids_delta.apply_to(document_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
field_distribution.retain(|_, v| *v == 0);
|
field_distribution.retain(|_, v| *v != 0);
|
||||||
|
|
||||||
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
|
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
|
||||||
let current_num_threads = rayon::current_num_threads();
|
let current_num_threads = rayon::current_num_threads();
|
||||||
|
Loading…
Reference in New Issue
Block a user