mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-12-04 02:55:20 +08:00
76 lines
2.5 KiB
Rust
76 lines
2.5 KiB
Rust
|
use std::io;
|
||
|
use std::io::{BufReader, Read};
|
||
|
use std::mem::size_of;
|
||
|
|
||
|
use byteorder::{BigEndian, ReadBytesExt};
|
||
|
use obkv::KvReader;
|
||
|
|
||
|
use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
|
||
|
use crate::FieldId;
|
||
|
|
||
|
/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
|
||
|
/// a `DocumentsBatchWriter`.
|
||
|
///
|
||
|
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
|
||
|
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
|
||
|
pub struct DocumentBatchReader<R> {
|
||
|
reader: BufReader<R>,
|
||
|
metadata: DocumentsMetadata,
|
||
|
buffer: Vec<u8>,
|
||
|
seen_documents: usize,
|
||
|
}
|
||
|
|
||
|
impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
|
||
|
/// Construct a `DocumentsReader` from a reader.
|
||
|
///
|
||
|
/// It first retrieves the index, then moves to the first document. Subsequent calls to
|
||
|
/// `next_document` advance the document reader until all the documents have been read.
|
||
|
pub fn from_reader(mut reader: R) -> Result<Self, Error> {
|
||
|
let mut buffer = Vec::new();
|
||
|
|
||
|
let meta_offset = reader.read_u64::<BigEndian>()?;
|
||
|
reader.seek(io::SeekFrom::Start(meta_offset))?;
|
||
|
reader.read_to_end(&mut buffer)?;
|
||
|
let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
|
||
|
|
||
|
reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
|
||
|
buffer.clear();
|
||
|
|
||
|
let reader = BufReader::new(reader);
|
||
|
|
||
|
Ok(Self { reader, metadata, buffer, seen_documents: 0 })
|
||
|
}
|
||
|
|
||
|
/// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
|
||
|
/// reference to the addition index.
|
||
|
pub fn next_document_with_index<'a>(
|
||
|
&'a mut self,
|
||
|
) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
|
||
|
if self.seen_documents < self.metadata.count {
|
||
|
let doc_len = self.reader.read_u32::<BigEndian>()?;
|
||
|
self.buffer.resize(doc_len as usize, 0);
|
||
|
self.reader.read_exact(&mut self.buffer)?;
|
||
|
self.seen_documents += 1;
|
||
|
|
||
|
let reader = KvReader::new(&self.buffer);
|
||
|
Ok(Some((&self.metadata.index, reader)))
|
||
|
} else {
|
||
|
Ok(None)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Return the fields index for the documents batch.
|
||
|
pub fn index(&self) -> &DocumentsBatchIndex {
|
||
|
&self.metadata.index
|
||
|
}
|
||
|
|
||
|
/// Returns the number of documents in the reader.
|
||
|
pub fn len(&self) -> usize {
|
||
|
self.metadata.count
|
||
|
}
|
||
|
|
||
|
pub fn is_empty(&self) -> bool {
|
||
|
self.len() == 0
|
||
|
}
|
||
|
}
|