Factorize some stuff

This commit is contained in:
Clément Renault 2024-09-04 12:17:13 +02:00
parent 6d74fb0229
commit 98e48371c3
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
3 changed files with 166 additions and 93 deletions

View File

@ -9,7 +9,7 @@ use super::StdResult;
use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY}; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY};
use crate::update::new::KvReaderFieldId; use crate::update::new::KvReaderFieldId;
use crate::update::MergeDeladdCboRoaringBitmaps; use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{DocumentId, Index}; use crate::{CboRoaringBitmapCodec, DocumentId, Index};
/// The capacity of the channel is currently in number of messages. /// The capacity of the channel is currently in number of messages.
pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) {
@ -103,7 +103,9 @@ pub struct WriterOperation {
pub enum Database { pub enum Database {
WordDocids, WordDocids,
ExactWordDocids,
WordFidDocids, WordFidDocids,
WordPositionDocids,
Documents, Documents,
Main, Main,
} }
@ -114,7 +116,9 @@ impl WriterOperation {
Database::Main => index.main.remap_types(), Database::Main => index.main.remap_types(),
Database::Documents => index.documents.remap_types(), Database::Documents => index.documents.remap_types(),
Database::WordDocids => index.word_docids.remap_types(), Database::WordDocids => index.word_docids.remap_types(),
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
Database::WordFidDocids => index.word_fid_docids.remap_types(), Database::WordFidDocids => index.word_fid_docids.remap_types(),
Database::WordPositionDocids => index.word_position_docids.remap_types(),
} }
} }
@ -141,11 +145,7 @@ impl MergerSender {
MainSender(&self.0) MainSender(&self.0)
} }
pub fn word_docids(&self) -> DocidsSender<'_, WordDocids> { pub fn docids<D: DatabaseType>(&self) -> DocidsSender<'_, D> {
DocidsSender { sender: &self.0, _marker: PhantomData }
}
pub fn word_fid_docids(&self) -> DocidsSender<'_, WordFidDocids> {
DocidsSender { sender: &self.0, _marker: PhantomData } DocidsSender { sender: &self.0, _marker: PhantomData }
} }
@ -187,21 +187,45 @@ impl MainSender<'_> {
} }
pub enum WordDocids {} pub enum WordDocids {}
pub enum ExactWordDocids {}
pub enum WordFidDocids {} pub enum WordFidDocids {}
pub enum WordPositionDocids {}
pub trait DatabaseType { pub trait DatabaseType {
fn database() -> Database; const DATABASE: Database;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation;
} }
impl DatabaseType for WordDocids { impl DatabaseType for WordDocids {
fn database() -> Database { const DATABASE: Database = Database::WordDocids;
Database::WordDocids
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::WordDocidsMerger(merger)
}
}
impl DatabaseType for ExactWordDocids {
const DATABASE: Database = Database::ExactWordDocids;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::ExactWordDocidsMerger(merger)
} }
} }
impl DatabaseType for WordFidDocids { impl DatabaseType for WordFidDocids {
fn database() -> Database { const DATABASE: Database = Database::WordFidDocids;
Database::WordFidDocids
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::WordFidDocidsMerger(merger)
}
}
impl DatabaseType for WordPositionDocids {
const DATABASE: Database = Database::WordPositionDocids;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::WordPositionDocidsMerger(merger)
} }
} }
@ -213,7 +237,7 @@ pub struct DocidsSender<'a, D> {
impl<D: DatabaseType> DocidsSender<'_, D> { impl<D: DatabaseType> DocidsSender<'_, D> {
pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
let entry = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); let entry = EntryOperation::Write(KeyValueEntry::from_key_value(key, value));
match self.sender.send(WriterOperation { database: D::database(), entry }) { match self.sender.send(WriterOperation { database: D::DATABASE, entry }) {
Ok(()) => Ok(()), Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())), Err(SendError(_)) => Err(SendError(())),
} }
@ -221,7 +245,7 @@ impl<D: DatabaseType> DocidsSender<'_, D> {
pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
let entry = EntryOperation::Delete(KeyEntry::from_key(key)); let entry = EntryOperation::Delete(KeyEntry::from_key(key));
match self.sender.send(WriterOperation { database: D::database(), entry }) { match self.sender.send(WriterOperation { database: D::DATABASE, entry }) {
Ok(()) => Ok(()), Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())), Err(SendError(_)) => Err(SendError(())),
} }
@ -258,7 +282,9 @@ impl DocumentsSender<'_> {
pub enum MergerOperation { pub enum MergerOperation {
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
WordFidDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), WordFidDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
WordPositionDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> }, InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
DeleteDocument { docid: DocumentId }, DeleteDocument { docid: DocumentId },
} }
@ -295,23 +321,11 @@ impl ExtractorSender {
} }
} }
pub fn word_docids( pub fn send_searchable<D: DatabaseType>(
&self, &self,
merger: Merger<File, MergeDeladdCboRoaringBitmaps>, merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
) -> StdResult<(), SendError<()>> { ) -> StdResult<(), SendError<()>> {
let operation = MergerOperation::WordDocidsMerger(merger); match self.0.send(D::new_merger_operation(merger)) {
match self.0.send(operation) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
pub fn word_fid_docids(
&self,
merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
) -> StdResult<(), SendError<()>> {
let operation = MergerOperation::WordFidDocidsMerger(merger);
match self.0.send(operation) {
Ok(()) => Ok(()), Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())), Err(SendError(_)) => Err(SendError(())),
} }

View File

@ -11,14 +11,21 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
use rayon::ThreadPool; use rayon::ThreadPool;
pub use update_by_function::UpdateByFunction; pub use update_by_function::UpdateByFunction;
use super::channel::{extractors_merger_channels, merger_writer_channel, EntryOperation}; use super::channel::{
extractors_merger_channels, merger_writer_channel, EntryOperation, ExactWordDocids, WordDocids,
WordFidDocids, WordPositionDocids,
};
use super::document_change::DocumentChange; use super::document_change::DocumentChange;
use super::extract::{SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor}; use super::extract::{
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
WordPositionDocidsExtractor,
};
use super::merger::merge_grenad_entries; use super::merger::merge_grenad_entries;
use super::StdResult; use super::StdResult;
use crate::documents::{ use crate::documents::{
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
}; };
use crate::update::new::channel::{DatabaseType, ExtractorSender};
use crate::update::GrenadParameters; use crate::update::GrenadParameters;
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
@ -82,36 +89,43 @@ where
let docid = insertion.docid(); let docid = insertion.docid();
let content = insertion.new(); let content = insertion.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap(); extractor_sender.document_insert(docid, content.boxed()).unwrap();
// extracted_dictionary_sender.send(self, dictionary: &[u8]); // extracted_dictionary_sender.send(self, dictionary: &[u8]);
} }
} }
Ok(()) as Result<_> Ok(()) as Result<_>
})?; })?;
// word docids extract_and_send_docids::<WordDocidsExtractor, WordDocids>(
let merger = WordDocidsExtractor::run_extraction(
index, index,
&global_fields_ids_map, &global_fields_ids_map,
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
GrenadParameters::default(), GrenadParameters::default(),
document_changes.clone(), document_changes.clone(),
&extractor_sender,
)?; )?;
/// TODO: manage the errors correctly extract_and_send_docids::<WordFidDocidsExtractor, WordFidDocids>(
extractor_sender.word_docids(merger).unwrap();
// word fid docids
let merger = WordFidDocidsExtractor::run_extraction(
index, index,
&global_fields_ids_map, &global_fields_ids_map,
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
GrenadParameters::default(), GrenadParameters::default(),
document_changes.clone(), document_changes.clone(),
&extractor_sender,
)?; )?;
/// TODO: manage the errors correctly extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
extractor_sender.word_fid_docids(merger).unwrap(); index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
Ok(()) as Result<_> Ok(()) as Result<_>
}) })
@ -148,6 +162,20 @@ where
Ok(()) Ok(())
} }
/// Runs the searchable extractor `E` over `document_changes` and sends the
/// merger it produces through `sender`, using the database type `D` to pick
/// the kind of merger operation that gets sent.
///
/// # Errors
///
/// Returns any error reported by `E::run_extraction`.
///
/// # Panics
///
/// Panics if the merger cannot be sent through `sender`.
///
/// TODO: GrenadParameters::default() should be removed in favor of a passed parameter
/// TODO: manage the errors correctly
/// TODO: we must have a single trait that also gives the extractor type
fn extract_and_send_docids<E: SearchableExtractor, D: DatabaseType>(
    index: &Index,
    fields_ids_map: &GlobalFieldsIdsMap,
    indexer: GrenadParameters,
    document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
    sender: &ExtractorSender,
) -> Result<()> {
    let extracted = E::run_extraction(index, fields_ids_map, indexer, document_changes)?;
    // A failed send is still treated as fatal, see the error-handling TODO above.
    sender.send_searchable::<D>(extracted).unwrap();
    Ok(())
}
/// TODO move this elsewhere /// TODO move this elsewhere
pub fn guess_primary_key<'a>( pub fn guess_primary_key<'a>(
rtxn: &'a RoTxn<'a>, rtxn: &'a RoTxn<'a>,

View File

@ -1,16 +1,24 @@
use std::fs::File;
use std::io;
use fst::set::OpBuilder; use fst::set::OpBuilder;
use fst::{Set, SetBuilder}; use fst::{Set, SetBuilder};
use grenad::Merger;
use heed::types::Bytes; use heed::types::Bytes;
use heed::RoTxn; use heed::{Database, RoTxn};
use memmap2::Mmap; use memmap2::Mmap;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use tempfile::tempfile; use tempfile::tempfile;
use super::channel::{MergerReceiver, MergerSender}; use super::channel::{
DatabaseType, DocidsSender, ExactWordDocids, MergerReceiver, MergerSender, WordDocids,
WordFidDocids, WordPositionDocids,
};
use super::KvReaderDelAdd; use super::KvReaderDelAdd;
use crate::index::main_key::WORDS_FST_KEY; use crate::index::main_key::WORDS_FST_KEY;
use crate::update::del_add::DelAdd; use crate::update::del_add::DelAdd;
use crate::update::new::channel::MergerOperation; use crate::update::new::channel::MergerOperation;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{CboRoaringBitmapCodec, Index, Result}; use crate::{CboRoaringBitmapCodec, Index, Result};
/// TODO We must return some infos/stats /// TODO We must return some infos/stats
@ -26,34 +34,18 @@ pub fn merge_grenad_entries(
for merger_operation in receiver { for merger_operation in receiver {
match merger_operation { match merger_operation {
MergerOperation::WordDocidsMerger(merger) => { MergerOperation::WordDocidsMerger(merger) => {
let word_docids_sender = sender.word_docids();
let database = index.word_docids.remap_types::<Bytes, Bytes>();
let mut add_words_fst = SetBuilder::new(tempfile()?)?; let mut add_words_fst = SetBuilder::new(tempfile()?)?;
let mut del_words_fst = SetBuilder::new(tempfile()?)?; let mut del_words_fst = SetBuilder::new(tempfile()?)?;
/// TODO manage the error correctly merge_and_send_docids(
let mut merger_iter = merger.into_stream_merger_iter().unwrap(); merger,
index.word_docids.remap_types(),
// TODO manage the error correctly rtxn,
while let Some((key, deladd)) = merger_iter.next().unwrap() { &mut buffer,
let current = database.get(rtxn, key)?; sender.docids::<WordDocids>(),
let deladd: &KvReaderDelAdd = deladd.into(); |key| add_words_fst.insert(key),
let del = deladd.get(DelAdd::Deletion); |key| del_words_fst.insert(key),
let add = deladd.get(DelAdd::Addition); )?;
match merge_cbo_bitmaps(current, del, add)? {
Operation::Write(bitmap) => {
let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
word_docids_sender.write(key, value).unwrap();
add_words_fst.insert(key)?;
}
Operation::Delete => {
word_docids_sender.delete(key).unwrap();
del_words_fst.insert(key)?;
}
Operation::Ignore => (),
}
}
// Move that into a dedicated function // Move that into a dedicated function
let words_fst = index.words_fst(rtxn)?; let words_fst = index.words_fst(rtxn)?;
@ -66,7 +58,6 @@ pub fn merge_grenad_entries(
let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? }; let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? };
let del_words_fst = Set::new(&del_words_fst_mmap)?; let del_words_fst = Set::new(&del_words_fst_mmap)?;
// TO BE IMPROVED @many
let diff = words_fst.op().add(&del_words_fst).difference(); let diff = words_fst.op().add(&del_words_fst).difference();
let stream = add_words_fst.op().add(diff).union(); let stream = add_words_fst.op().add(diff).union();
@ -79,31 +70,38 @@ pub fn merge_grenad_entries(
let main_sender = sender.main(); let main_sender = sender.main();
main_sender.write_words_fst(&words_fst_mmap).unwrap(); main_sender.write_words_fst(&words_fst_mmap).unwrap();
} }
MergerOperation::ExactWordDocidsMerger(merger) => {
merge_and_send_docids(
merger,
index.exact_word_docids.remap_types(),
rtxn,
&mut buffer,
sender.docids::<ExactWordDocids>(),
|_key| Ok(()),
|_key| Ok(()),
)?;
}
MergerOperation::WordFidDocidsMerger(merger) => { MergerOperation::WordFidDocidsMerger(merger) => {
let word_docids_sender = sender.word_fid_docids(); merge_and_send_docids(
let database = index.word_fid_docids.remap_types::<Bytes, Bytes>(); merger,
index.word_fid_docids.remap_types(),
/// TODO manage the error correctly rtxn,
let mut merger_iter = merger.into_stream_merger_iter().unwrap(); &mut buffer,
sender.docids::<WordFidDocids>(),
// TODO manage the error correctly |_key| Ok(()),
while let Some((key, deladd)) = merger_iter.next().unwrap() { |_key| Ok(()),
let current = database.get(rtxn, key)?; )?;
let deladd: &KvReaderDelAdd = deladd.into(); }
let del = deladd.get(DelAdd::Deletion); MergerOperation::WordPositionDocidsMerger(merger) => {
let add = deladd.get(DelAdd::Addition); merge_and_send_docids(
merger,
match merge_cbo_bitmaps(current, del, add)? { index.word_position_docids.remap_types(),
Operation::Write(bitmap) => { rtxn,
let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); &mut buffer,
word_docids_sender.write(key, value).unwrap(); sender.docids::<WordPositionDocids>(),
} |_key| Ok(()),
Operation::Delete => { |_key| Ok(()),
word_docids_sender.delete(key).unwrap(); )?;
}
Operation::Ignore => (),
}
}
} }
MergerOperation::InsertDocument { docid, document } => { MergerOperation::InsertDocument { docid, document } => {
documents_ids.insert(docid); documents_ids.insert(docid);
@ -128,6 +126,39 @@ pub fn merge_grenad_entries(
Ok(()) Ok(())
} }
/// Streams every entry out of `merger`, merges its deletion and addition
/// parts with the value currently stored in `database` for the same key, and
/// forwards the outcome through `word_docids_sender`.
///
/// For each key whose merged bitmap must be written, the bitmap is serialized
/// into `buffer`, sent as a write, and `add_key` is then called with that key.
/// For each key that must be removed, a delete is sent and `del_key` is then
/// called with that key. Entries that resolve to `Operation::Ignore` are skipped.
///
/// # Errors
///
/// Returns the errors produced by the database lookup, by `merge_cbo_bitmaps`
/// and by the `add_key`/`del_key` callbacks.
///
/// # Panics
///
/// Panics if the merger iterator cannot be created or advanced, or if sending
/// through `word_docids_sender` fails.
fn merge_and_send_docids<D: DatabaseType>(
    merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
    database: Database<Bytes, Bytes>,
    rtxn: &RoTxn<'_>,
    buffer: &mut Vec<u8>,
    word_docids_sender: DocidsSender<'_, D>,
    mut add_key: impl FnMut(&[u8]) -> fst::Result<()>,
    mut del_key: impl FnMut(&[u8]) -> fst::Result<()>,
) -> Result<()> {
    // TODO manage the errors correctly instead of unwrapping.
    let mut entries = merger.into_stream_merger_iter().unwrap();

    while let Some((key, raw_deladd)) = entries.next().unwrap() {
        let stored = database.get(rtxn, key)?;
        let reader: &KvReaderDelAdd = raw_deladd.into();
        let deletions = reader.get(DelAdd::Deletion);
        let additions = reader.get(DelAdd::Addition);

        let operation = merge_cbo_bitmaps(stored, deletions, additions)?;
        match operation {
            Operation::Write(bitmap) => {
                let serialized = cbo_bitmap_serialize_into_vec(&bitmap, buffer);
                word_docids_sender.write(key, serialized).unwrap();
                add_key(key)?;
            }
            Operation::Delete => {
                word_docids_sender.delete(key).unwrap();
                del_key(key)?;
            }
            Operation::Ignore => {}
        }
    }

    Ok(())
}
enum Operation { enum Operation {
Write(RoaringBitmap), Write(RoaringBitmap),
Delete, Delete,