2022-03-04 10:46:59 +08:00
|
|
|
use std::borrow::Borrow;
|
|
|
|
use std::fmt::{self, Debug, Display};
|
2021-10-21 03:20:28 +08:00
|
|
|
use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Seek, Write};
|
2021-09-28 17:59:55 +08:00
|
|
|
|
2021-11-09 01:31:27 +08:00
|
|
|
use meilisearch_error::{internal_error, Code, ErrorCode};
|
2021-09-28 17:59:55 +08:00
|
|
|
use milli::documents::DocumentBatchBuilder;
|
|
|
|
|
|
|
|
/// Shorthand for `std::result::Result` with the error type fixed to [`DocumentFormatError`].
type Result<T> = std::result::Result<T, DocumentFormatError>;
|
|
|
|
|
|
|
|
/// The format a document payload is declared to be in.
///
/// The value is attached to parsing errors so that the error message can name
/// the format that failed to parse. It is a plain field-less enum, so it is
/// cheap to copy and can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PayloadType {
    /// Newline-delimited JSON: one JSON value per line.
    Ndjson,
    /// A regular JSON payload.
    Json,
    /// Comma-separated values.
    Csv,
}
|
|
|
|
|
|
|
|
impl fmt::Display for PayloadType {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
|
match self {
|
2021-09-29 16:17:52 +08:00
|
|
|
PayloadType::Ndjson => write!(f, "ndjson"),
|
2021-09-29 02:13:26 +08:00
|
|
|
PayloadType::Json => write!(f, "json"),
|
2021-09-29 04:58:48 +08:00
|
|
|
PayloadType::Csv => write!(f, "csv"),
|
2021-09-28 17:59:55 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-04 10:46:59 +08:00
|
|
|
#[derive(Debug)]
|
2021-09-28 17:59:55 +08:00
|
|
|
pub enum DocumentFormatError {
|
|
|
|
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
|
2022-03-04 10:46:59 +08:00
|
|
|
MalformedPayload(Box<milli::documents::Error>, PayloadType),
|
|
|
|
}
|
|
|
|
impl Display for DocumentFormatError {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
|
match self {
|
2022-03-08 12:03:59 +08:00
|
|
|
Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e),
|
2022-03-04 10:46:59 +08:00
|
|
|
Self::MalformedPayload(me, b) => match me.borrow() {
|
2022-03-04 15:31:11 +08:00
|
|
|
milli::documents::Error::JsonError(se) => write!(
|
2022-03-04 10:46:59 +08:00
|
|
|
f,
|
2022-03-08 12:03:59 +08:00
|
|
|
"The `{}` payload provided is malformed. `Couldn't serialize document value at line {} column {}`",
|
|
|
|
b, se.line(), se.column()
|
2022-03-04 10:46:59 +08:00
|
|
|
),
|
2022-03-08 12:03:59 +08:00
|
|
|
_ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me),
|
2022-03-04 10:46:59 +08:00
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
2021-09-28 17:59:55 +08:00
|
|
|
}
|
2022-03-04 10:46:59 +08:00
|
|
|
// Empty impl: every `std::error::Error` method keeps its default, so the
// error is described solely through its `Debug` and `Display` impls.
impl std::error::Error for DocumentFormatError {}
|
2021-09-28 17:59:55 +08:00
|
|
|
|
2021-10-21 03:20:28 +08:00
|
|
|
impl From<(PayloadType, milli::documents::Error)> for DocumentFormatError {
|
|
|
|
fn from((ty, error): (PayloadType, milli::documents::Error)) -> Self {
|
|
|
|
match error {
|
|
|
|
milli::documents::Error::Io(e) => Self::Internal(Box::new(e)),
|
|
|
|
e => Self::MalformedPayload(Box::new(e), ty),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-30 16:35:24 +08:00
|
|
|
impl ErrorCode for DocumentFormatError {
|
|
|
|
fn error_code(&self) -> Code {
|
|
|
|
match self {
|
|
|
|
DocumentFormatError::Internal(_) => Code::Internal,
|
|
|
|
DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-21 03:20:28 +08:00
|
|
|
// NOTE(review): presumably generates `From<io::Error> for DocumentFormatError`
// (the bare `?` on `read_line` in `read_ndjson` needs such a conversion) —
// confirm against the `internal_error!` macro definition.
internal_error!(DocumentFormatError: io::Error);
|
2021-09-28 17:59:55 +08:00
|
|
|
|
2021-10-21 03:20:28 +08:00
|
|
|
/// reads csv from input and write an obkv batch to writer.
|
2021-12-02 23:03:26 +08:00
|
|
|
pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
2021-10-21 03:20:28 +08:00
|
|
|
let writer = BufWriter::new(writer);
|
2021-10-27 01:36:48 +08:00
|
|
|
let builder =
|
|
|
|
DocumentBatchBuilder::from_csv(input, writer).map_err(|e| (PayloadType::Csv, e))?;
|
2021-10-28 18:13:51 +08:00
|
|
|
|
2021-12-02 23:03:26 +08:00
|
|
|
let count = builder.finish().map_err(|e| (PayloadType::Csv, e))?;
|
2021-09-29 04:58:48 +08:00
|
|
|
|
2021-12-02 23:03:26 +08:00
|
|
|
Ok(count)
|
2021-09-29 04:58:48 +08:00
|
|
|
}
|
|
|
|
|
2021-10-21 03:20:28 +08:00
|
|
|
/// reads jsonl from input and write an obkv batch to writer.
|
2021-12-02 23:03:26 +08:00
|
|
|
pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
2021-10-21 03:20:28 +08:00
|
|
|
let mut reader = BufReader::new(input);
|
|
|
|
let writer = BufWriter::new(writer);
|
|
|
|
|
|
|
|
let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Ndjson, e))?;
|
|
|
|
let mut buf = String::new();
|
2021-09-28 17:59:55 +08:00
|
|
|
|
2021-10-21 03:20:28 +08:00
|
|
|
while reader.read_line(&mut buf)? > 0 {
|
2022-01-13 19:30:35 +08:00
|
|
|
// skip empty lines
|
|
|
|
if buf == "\n" {
|
|
|
|
buf.clear();
|
|
|
|
continue;
|
|
|
|
}
|
2021-10-21 03:20:28 +08:00
|
|
|
builder
|
|
|
|
.extend_from_json(Cursor::new(&buf.as_bytes()))
|
|
|
|
.map_err(|e| (PayloadType::Ndjson, e))?;
|
|
|
|
buf.clear();
|
2021-09-28 17:59:55 +08:00
|
|
|
}
|
|
|
|
|
2021-12-02 23:03:26 +08:00
|
|
|
let count = builder.finish().map_err(|e| (PayloadType::Ndjson, e))?;
|
2021-09-28 17:59:55 +08:00
|
|
|
|
2021-12-02 23:03:26 +08:00
|
|
|
Ok(count)
|
2021-09-28 17:59:55 +08:00
|
|
|
}
|
2021-09-29 02:13:26 +08:00
|
|
|
|
2021-10-21 03:20:28 +08:00
|
|
|
/// reads json from input and write an obkv batch to writer.
|
2021-12-02 23:03:26 +08:00
|
|
|
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
2021-10-21 03:20:28 +08:00
|
|
|
let writer = BufWriter::new(writer);
|
|
|
|
let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Json, e))?;
|
|
|
|
builder
|
|
|
|
.extend_from_json(input)
|
|
|
|
.map_err(|e| (PayloadType::Json, e))?;
|
2021-10-28 18:13:51 +08:00
|
|
|
|
2021-12-02 23:03:26 +08:00
|
|
|
let count = builder.finish().map_err(|e| (PayloadType::Json, e))?;
|
2021-09-29 02:13:26 +08:00
|
|
|
|
2021-12-02 23:03:26 +08:00
|
|
|
Ok(count)
|
2021-09-29 02:13:26 +08:00
|
|
|
}
|