2021-08-16 13:36:30 +02:00
|
|
|
use std::fs::File;
|
2023-09-28 16:26:01 +02:00
|
|
|
use std::io::{self, BufReader, BufWriter, Seek};
|
2021-08-16 13:36:30 +02:00
|
|
|
|
2024-08-30 11:49:47 +02:00
|
|
|
use grenad::{CompressionType, MergeFunction, Sorter};
|
2023-11-27 11:52:22 +01:00
|
|
|
use heed::types::Bytes;
|
2021-08-16 13:36:30 +02:00
|
|
|
|
2024-08-30 11:49:47 +02:00
|
|
|
use super::ClonableMmap;
|
2023-11-08 16:41:26 +01:00
|
|
|
use crate::update::index_documents::valid_lmdb_key;
|
2021-08-16 13:36:30 +02:00
|
|
|
use crate::Result;
|
|
|
|
|
2024-02-06 10:47:04 +01:00
|
|
|
/// This is something reasonable given the fact
|
|
|
|
/// that there is one grenad sorter per thread.
|
2024-02-07 11:24:43 +01:00
|
|
|
const MAX_GRENAD_SORTER_USAGE: usize = 500 * 1024 * 1024; // 500 MiB
|
2024-02-06 10:47:04 +01:00
|
|
|
|
2021-08-16 13:36:30 +02:00
|
|
|
/// An [`io::Cursor`] wrapping a [`ClonableMmap`]; this is the source type of
/// the readers returned by [`as_cloneable_grenad`].
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
|
|
|
|
|
|
|
/// Creates a grenad writer that writes into `file` through a [`BufWriter`].
///
/// - `typ`: the compression type given to the writer builder.
/// - `level`: when `Some`, the compression level given to the writer builder;
///   when `None`, the builder's own level is left untouched.
/// - `file`: the destination, wrapped in a [`BufWriter`] before being handed
///   to grenad.
pub fn create_writer<R: io::Write>(
    typ: grenad::CompressionType,
    level: Option<u32>,
    file: R,
) -> grenad::Writer<BufWriter<R>> {
    let mut writer_builder = grenad::Writer::builder();
    writer_builder.compression_type(typ);
    if let Some(compression_level) = level {
        writer_builder.compression_level(compression_level);
    }

    let buffered = BufWriter::new(file);
    writer_builder.build(buffered)
}
|
|
|
|
|
2024-02-06 10:47:04 +01:00
|
|
|
/// A helper function that creates a grenad sorter
|
|
|
|
/// with the given parameters. The max memory is
|
|
|
|
/// clamped to something reasonable.
|
2024-08-30 11:49:47 +02:00
|
|
|
pub fn create_sorter<MF: MergeFunction>(
|
2022-09-13 10:40:37 +02:00
|
|
|
sort_algorithm: grenad::SortAlgorithm,
|
2024-08-30 11:49:47 +02:00
|
|
|
merge: MF,
|
2021-08-16 13:36:30 +02:00
|
|
|
chunk_compression_type: grenad::CompressionType,
|
|
|
|
chunk_compression_level: Option<u32>,
|
|
|
|
max_nb_chunks: Option<usize>,
|
|
|
|
max_memory: Option<usize>,
|
2024-10-17 09:30:18 +02:00
|
|
|
sort_in_parallel: bool,
|
2024-08-30 11:49:47 +02:00
|
|
|
) -> grenad::Sorter<MF> {
|
2021-08-16 13:36:30 +02:00
|
|
|
let mut builder = grenad::Sorter::builder(merge);
|
|
|
|
builder.chunk_compression_type(chunk_compression_type);
|
|
|
|
if let Some(level) = chunk_compression_level {
|
|
|
|
builder.chunk_compression_level(level);
|
|
|
|
}
|
|
|
|
if let Some(nb_chunks) = max_nb_chunks {
|
|
|
|
builder.max_nb_chunks(nb_chunks);
|
|
|
|
}
|
|
|
|
if let Some(memory) = max_memory {
|
2024-02-06 10:47:04 +01:00
|
|
|
builder.dump_threshold(memory.min(MAX_GRENAD_SORTER_USAGE));
|
2021-08-16 13:36:30 +02:00
|
|
|
builder.allow_realloc(false);
|
|
|
|
}
|
2022-09-13 10:40:37 +02:00
|
|
|
builder.sort_algorithm(sort_algorithm);
|
2024-10-17 09:30:18 +02:00
|
|
|
builder.sort_in_parallel(sort_in_parallel);
|
2021-08-16 13:36:30 +02:00
|
|
|
builder.build()
|
|
|
|
}
|
|
|
|
|
2024-01-23 09:42:48 +01:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
2024-08-30 11:49:47 +02:00
|
|
|
pub fn sorter_into_reader<MF>(
|
|
|
|
sorter: grenad::Sorter<MF>,
|
2021-08-16 13:36:30 +02:00
|
|
|
indexer: GrenadParameters,
|
2024-08-30 11:49:47 +02:00
|
|
|
) -> Result<grenad::Reader<BufReader<File>>>
|
|
|
|
where
|
|
|
|
MF: MergeFunction,
|
|
|
|
crate::Error: From<MF::Error>,
|
|
|
|
{
|
2022-02-16 15:28:48 +01:00
|
|
|
let mut writer = create_writer(
|
|
|
|
indexer.chunk_compression_type,
|
|
|
|
indexer.chunk_compression_level,
|
|
|
|
tempfile::tempfile()?,
|
|
|
|
);
|
|
|
|
sorter.write_into_stream_writer(&mut writer)?;
|
|
|
|
|
2022-10-13 22:02:54 +02:00
|
|
|
writer_into_reader(writer)
|
2021-08-16 13:36:30 +02:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:26:01 +02:00
|
|
|
pub fn writer_into_reader(
|
|
|
|
writer: grenad::Writer<BufWriter<File>>,
|
|
|
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
2023-10-10 15:12:23 +02:00
|
|
|
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
2023-01-30 17:17:35 +01:00
|
|
|
file.rewind()?;
|
2023-09-28 16:26:01 +02:00
|
|
|
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
2021-08-16 13:36:30 +02:00
|
|
|
}
|
|
|
|
|
2022-02-16 15:40:08 +01:00
|
|
|
pub unsafe fn as_cloneable_grenad(
|
2023-09-28 16:26:01 +02:00
|
|
|
reader: &grenad::Reader<BufReader<File>>,
|
2021-08-16 13:36:30 +02:00
|
|
|
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
2023-09-28 16:26:01 +02:00
|
|
|
let file = reader.get_ref().get_ref();
|
2022-02-16 15:40:08 +01:00
|
|
|
let mmap = memmap2::Mmap::map(file)?;
|
2021-08-16 13:36:30 +02:00
|
|
|
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
|
|
|
let reader = grenad::Reader::new(cursor)?;
|
|
|
|
Ok(reader)
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Copy)]
|
|
|
|
pub struct GrenadParameters {
|
|
|
|
pub chunk_compression_type: CompressionType,
|
|
|
|
pub chunk_compression_level: Option<u32>,
|
|
|
|
pub max_memory: Option<usize>,
|
|
|
|
pub max_nb_chunks: Option<usize>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Default for GrenadParameters {
|
|
|
|
fn default() -> Self {
|
|
|
|
Self {
|
|
|
|
chunk_compression_type: CompressionType::None,
|
|
|
|
chunk_compression_level: None,
|
|
|
|
max_memory: None,
|
|
|
|
max_nb_chunks: None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl GrenadParameters {
|
2021-09-02 15:17:52 +02:00
|
|
|
/// This function use the number of threads in the current threadpool to compute the value.
|
2024-02-06 10:47:04 +01:00
|
|
|
///
|
2021-09-02 15:17:52 +02:00
|
|
|
/// This should be called inside of a rayon thread pool,
|
2024-02-06 10:47:04 +01:00
|
|
|
/// otherwise, it will take the global number of threads.
|
|
|
|
///
|
|
|
|
/// The max memory cannot exceed a given reasonable value.
|
2021-08-16 13:36:30 +02:00
|
|
|
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
2024-02-06 10:47:04 +01:00
|
|
|
self.max_memory.map(|max_memory| {
|
|
|
|
(max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE)
|
|
|
|
})
|
2021-08-16 13:36:30 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns an iterator that outputs grenad readers of obkv documents
|
|
|
|
/// with a maximum size of approximately `documents_chunks_size`.
|
|
|
|
///
|
|
|
|
/// The grenad obkv entries are composed of an incremental document id big-endian
|
|
|
|
/// encoded as the key and an obkv object with an `u8` for the field as the key
|
|
|
|
/// and a simple UTF-8 encoded string as the value.
|
2022-02-16 15:28:48 +01:00
|
|
|
pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
|
|
|
reader: grenad::Reader<R>,
|
2021-08-16 13:36:30 +02:00
|
|
|
indexer: GrenadParameters,
|
2021-09-02 15:17:52 +02:00
|
|
|
documents_chunk_size: usize,
|
2023-09-28 16:26:01 +02:00
|
|
|
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
|
2021-08-16 13:36:30 +02:00
|
|
|
let mut continue_reading = true;
|
2022-02-16 15:28:48 +01:00
|
|
|
let mut cursor = reader.into_cursor()?;
|
2021-08-16 13:36:30 +02:00
|
|
|
|
|
|
|
let mut transposer = move || {
|
|
|
|
if !continue_reading {
|
|
|
|
return Ok(None);
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut current_chunk_size = 0u64;
|
2022-02-16 15:28:48 +01:00
|
|
|
let mut obkv_documents = create_writer(
|
2022-10-13 22:02:54 +02:00
|
|
|
indexer.chunk_compression_type,
|
|
|
|
indexer.chunk_compression_level,
|
2022-02-16 15:28:48 +01:00
|
|
|
tempfile::tempfile()?,
|
|
|
|
);
|
|
|
|
|
|
|
|
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
2023-11-02 16:49:03 +01:00
|
|
|
if !obkv.is_empty() {
|
|
|
|
obkv_documents.insert(document_id, obkv)?;
|
|
|
|
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
2021-08-16 13:36:30 +02:00
|
|
|
|
2023-11-02 16:49:03 +01:00
|
|
|
if current_chunk_size >= documents_chunk_size as u64 {
|
|
|
|
return writer_into_reader(obkv_documents).map(Some);
|
|
|
|
}
|
2021-08-16 13:36:30 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
continue_reading = false;
|
|
|
|
writer_into_reader(obkv_documents).map(Some)
|
|
|
|
};
|
|
|
|
|
2021-09-02 16:57:46 +02:00
|
|
|
Ok(std::iter::from_fn(move || transposer().transpose()))
|
2021-08-16 13:36:30 +02:00
|
|
|
}
|
|
|
|
|
2023-11-08 16:41:26 +01:00
|
|
|
/// Write provided sorter in database using serialize_value function.
|
|
|
|
/// merge_values function is used if an entry already exist in the database.
|
2024-01-23 09:42:48 +01:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
2024-08-30 11:49:47 +02:00
|
|
|
pub fn write_sorter_into_database<K, V, FS, FM, MF>(
|
|
|
|
sorter: Sorter<MF>,
|
2023-11-08 16:41:26 +01:00
|
|
|
database: &heed::Database<K, V>,
|
2024-07-09 11:25:39 -04:00
|
|
|
wtxn: &mut heed::RwTxn<'_>,
|
2023-11-08 16:41:26 +01:00
|
|
|
index_is_empty: bool,
|
|
|
|
serialize_value: FS,
|
|
|
|
merge_values: FM,
|
|
|
|
) -> Result<()>
|
|
|
|
where
|
|
|
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
|
|
|
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
2024-08-30 11:49:47 +02:00
|
|
|
MF: MergeFunction,
|
|
|
|
crate::Error: From<MF::Error>,
|
2023-11-08 16:41:26 +01:00
|
|
|
{
|
|
|
|
let mut buffer = Vec::new();
|
2023-11-27 11:52:22 +01:00
|
|
|
let database = database.remap_types::<Bytes, Bytes>();
|
2023-11-08 16:41:26 +01:00
|
|
|
|
|
|
|
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
|
|
|
while let Some((key, value)) = merger_iter.next()? {
|
|
|
|
if valid_lmdb_key(key) {
|
|
|
|
buffer.clear();
|
|
|
|
let value = if index_is_empty {
|
|
|
|
Some(serialize_value(value, &mut buffer)?)
|
|
|
|
} else {
|
|
|
|
match database.get(wtxn, key)? {
|
|
|
|
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
|
|
|
None => Some(serialize_value(value, &mut buffer)?),
|
|
|
|
}
|
|
|
|
};
|
|
|
|
match value {
|
|
|
|
Some(value) => database.put(wtxn, key, value)?,
|
|
|
|
None => {
|
|
|
|
database.delete(wtxn, key)?;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|