refactor spawn_extraction_task

This commit is contained in:
ad hoc 2022-03-23 14:48:15 +01:00
parent f82d4b36eb
commit 5f9f82757d
No known key found for this signature in database
GPG Key ID: 4F00A782990CC643
3 changed files with 69 additions and 30 deletions

View File

@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids; use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{ use super::helpers::{
as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader,
}; };
use super::{helpers, TypedChunk}; use super::{helpers, TypedChunk};
use crate::{FieldId, Result}; use crate::{FieldId, Result};
@ -66,7 +66,7 @@ pub(crate) fn data_from_obkv_documents(
(docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
) = result?; ) = result?;
spawn_extraction_task( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer.clone(), indexer.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
@ -76,7 +76,7 @@ pub(crate) fn data_from_obkv_documents(
"word-pair-proximity-docids", "word-pair-proximity-docids",
); );
spawn_extraction_task( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer.clone(), indexer.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents(
"field-id-wordcount-docids", "field-id-wordcount-docids",
); );
spawn_extraction_task( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer.clone(), indexer.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents(
"word-docids", "word-docids",
); );
spawn_extraction_task( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer.clone(), indexer.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents(
"word-position-docids", "word-position-docids",
); );
spawn_extraction_task( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_fid_facet_strings_chunks.clone(), docid_fid_facet_strings_chunks.clone(),
indexer.clone(), indexer.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents(
"field-id-facet-string-docids", "field-id-facet-string-docids",
); );
spawn_extraction_task( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_fid_facet_numbers_chunks.clone(), docid_fid_facet_numbers_chunks.clone(),
indexer.clone(), indexer.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
/// Generated grenad chunks are merged using the merge_fn. /// Generated grenad chunks are merged using the merge_fn.
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// The result of merged chunks is serialized as TypedChunk using the serialize_fn
/// and sent into lmdb_writer_sx. /// and sent into lmdb_writer_sx.
fn spawn_extraction_task<FE, FS>( fn spawn_extraction_task<FE, FS, M>(
chunks: Vec<grenad::Reader<CursorClonableMmap>>, chunks: Vec<grenad::Reader<CursorClonableMmap>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
@ -142,19 +142,21 @@ fn spawn_extraction_task<FE, FS>(
serialize_fn: FS, serialize_fn: FS,
name: &'static str, name: &'static str,
) where ) where
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<grenad::Reader<File>> FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
+ Sync + Sync
+ Send + Send
+ 'static, + 'static,
FS: Fn(grenad::Reader<File>) -> TypedChunk + Sync + Send + 'static, FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
M::Output: Send,
{ {
rayon::spawn(move || { rayon::spawn(move || {
let chunks: Result<Vec<_>> = let chunks: Result<M> =
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect();
rayon::spawn(move || match chunks { rayon::spawn(move || match chunks {
Ok(chunks) => { Ok(chunks) => {
debug!("merge {} database", name); debug!("merge {} database", name);
let reader = merge_readers(chunks, merge_fn, indexer); let reader = chunks.merge(merge_fn, &indexer);
let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
} }
Err(e) => { Err(e) => {

View File

@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad(
Ok(reader) Ok(reader)
} }
// NOTE(review): this span is a side-by-side diff hunk, not compilable Rust. Where a line holds
// two fragments, the first is the removed `merge_readers` function and the second is the added
// `MergeableReader` code; lines with a single fragment belong to only one of the two versions.
//
// Removed side: `merge_readers` took a `Vec<grenad::Reader<R>>`, a `MergeFn` and
// `GrenadParameters`, pushed each reader's cursor into a `grenad::MergerBuilder`, streamed the
// merged entries into a writer created over a `tempfile::tempfile()`, and returned the result of
// `writer_into_reader` as a `Result<grenad::Reader<File>>`.
//
// Added side: `MergeableReader` is a trait (requiring `Self: Sized`) with an associated `Output`
// type and one method, `merge`, which consumes `self` and returns `Result<Self::Output>`.
pub fn merge_readers<R: io::Read + io::Seek>( pub trait MergeableReader
readers: Vec<grenad::Reader<R>>, where
merge_fn: MergeFn, Self: Sized,
indexer: GrenadParameters, {
) -> Result<grenad::Reader<File>> { type Output;
let mut merger_builder = grenad::MergerBuilder::new(merge_fn);
for reader in readers { fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
merger_builder.push(reader.into_cursor()?); }
// Added side: merging a `Vec` of readers yields a single `grenad::Reader<File>`.
impl MergeableReader for Vec<grenad::Reader<File>> {
type Output = grenad::Reader<File>;
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut merger = MergerBuilder::new(merge_fn);
// `try_for_each` + `?` stops at the first reader whose `push` returns an error.
self.into_iter().try_for_each(|r| merger.push(r))?;
merger.finish(params)
}
}
// Added side: merging a `Vec` of reader pairs yields one pair of readers; the first elements of
// every pair are merged together, and so are the second elements.
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
// Both builders are created from the same `merge_fn` value.
let mut m1 = MergerBuilder::new(merge_fn);
let mut m2 = MergerBuilder::new(merge_fn);
for (r1, r2) in self.into_iter() {
m1.push(r1)?;
m2.push(r2)?;
}
// `m1` is finished first; if that fails, `?` returns before `m2.finish` is called.
Ok((m1.finish(params)?, m2.finish(params)?))
}
}
// Added side: private newtype around `grenad::MergerBuilder<R, MergeFn>`; its `new`, `push` and
// `finish` methods carry the steps that the removed `merge_readers` performed inline.
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
impl<R: io::Read + io::Seek> MergerBuilder<R> {
fn new(merge_fn: MergeFn) -> Self {
Self(grenad::MergerBuilder::new(merge_fn))
} }
// Added side: `push` converts the reader into a cursor (fallible, hence the `?`) and adds that
// cursor to the wrapped builder.
let merger = merger_builder.build(); fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
let mut writer = create_writer( self.0.push(reader.into_cursor()?);
indexer.chunk_compression_type, Ok(())
indexer.chunk_compression_level, }
tempfile::tempfile()?,
);
merger.write_into_stream_writer(&mut writer)?;
// Added side: `finish` consumes the builder, builds the merger, and writes the merged stream into
// a writer created with `params.chunk_compression_type` / `params.chunk_compression_level` over a
// `tempfile::tempfile()`, then returns the result of `writer_into_reader`.
Ok(writer_into_reader(writer)?) fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
let merger = self.0.build();
let mut writer = create_writer(
params.chunk_compression_type,
params.chunk_compression_level,
tempfile::tempfile()?,
);
merger.write_into_stream_writer(&mut writer)?;
Ok(writer_into_reader(writer)?)
}
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]

View File

@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto};
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
pub use grenad_helpers::{ pub use grenad_helpers::{
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers, as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
GrenadParameters, GrenadParameters, MergeableReader,
}; };
pub use merge_functions::{ pub use merge_functions::{
concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,