mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Add chunking
This commit is contained in:
parent
c1fcb2ebc6
commit
cd378e5bd2
@ -54,12 +54,20 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
|
|||||||
))))
|
))))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process(
|
fn process<'doc>(
|
||||||
&self,
|
&self,
|
||||||
change: DocumentChange,
|
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||||
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change)
|
for change in changes {
|
||||||
|
let change = change?;
|
||||||
|
FacetedDocidsExtractor::extract_document_change(
|
||||||
|
context,
|
||||||
|
self.attributes_to_extract,
|
||||||
|
change,
|
||||||
|
)?
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -325,12 +325,16 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
|
|||||||
))))
|
))))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process(
|
fn process<'doc>(
|
||||||
&self,
|
&self,
|
||||||
change: DocumentChange,
|
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||||
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
|
for change in changes {
|
||||||
|
let change = change?;
|
||||||
|
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -55,12 +55,16 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
|||||||
))))
|
))))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process(
|
fn process<'doc>(
|
||||||
&self,
|
&self,
|
||||||
change: DocumentChange,
|
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||||
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
EX::extract_document_change(context, self.tokenizer, change)
|
for change in changes {
|
||||||
|
let change = change?;
|
||||||
|
EX::extract_document_change(context, self.tokenizer, change)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -323,7 +323,7 @@ pub trait Extractor<'extractor>: Sync {
|
|||||||
|
|
||||||
fn process<'doc>(
|
fn process<'doc>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
change: DocumentChange<'doc>,
|
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||||
context: &'doc DocumentChangeContext<Self::Data>,
|
context: &'doc DocumentChangeContext<Self::Data>,
|
||||||
) -> Result<()>;
|
) -> Result<()>;
|
||||||
}
|
}
|
||||||
@ -332,13 +332,13 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload
|
|||||||
>: Sync {
|
>: Sync {
|
||||||
type Item: Send;
|
type Item: Send;
|
||||||
|
|
||||||
fn iter(&self) -> impl IndexedParallelIterator<Item = Self::Item>;
|
fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>>;
|
||||||
|
|
||||||
fn item_to_document_change<'doc, // lifetime of a single `process` call
|
fn item_to_document_change<'doc, // lifetime of a single `process` call
|
||||||
T: MostlySend>(
|
T: MostlySend>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
context: &'doc DocumentChangeContext<T>,
|
context: &'doc DocumentChangeContext<T>,
|
||||||
item: Self::Item,
|
item: &'doc Self::Item,
|
||||||
) -> Result<Option<DocumentChange<'doc>>> where 'pl: 'doc // the payload must survive the process calls
|
) -> Result<Option<DocumentChange<'doc>>> where 'pl: 'doc // the payload must survive the process calls
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
@ -356,6 +356,8 @@ pub struct IndexingContext<
|
|||||||
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const CHUNK_SIZE: usize = 100;
|
||||||
|
|
||||||
pub fn for_each_document_change<
|
pub fn for_each_document_change<
|
||||||
'pl, // covariant lifetime of the underlying payload
|
'pl, // covariant lifetime of the underlying payload
|
||||||
'extractor, // invariant lifetime of extractor_alloc
|
'extractor, // invariant lifetime of extractor_alloc
|
||||||
@ -386,7 +388,7 @@ where
|
|||||||
extractor_alloc.0.get_mut().reset();
|
extractor_alloc.0.get_mut().reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
let pi = document_changes.iter();
|
let pi = document_changes.iter(CHUNK_SIZE);
|
||||||
pi.try_arc_for_each_try_init(
|
pi.try_arc_for_each_try_init(
|
||||||
|| {
|
|| {
|
||||||
DocumentChangeContext::new(
|
DocumentChangeContext::new(
|
||||||
@ -400,17 +402,16 @@ where
|
|||||||
move |index_alloc| extractor.init_data(index_alloc),
|
move |index_alloc| extractor.init_data(index_alloc),
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
|context, item| {
|
|context, items| {
|
||||||
// Clean up and reuse the document-specific allocator
|
// Clean up and reuse the document-specific allocator
|
||||||
context.doc_alloc.reset();
|
context.doc_alloc.reset();
|
||||||
|
|
||||||
let Some(change) =
|
let items = items.as_ref();
|
||||||
document_changes.item_to_document_change(context, item).map_err(Arc::new)?
|
let changes = items.iter().filter_map(|item| {
|
||||||
else {
|
document_changes.item_to_document_change(context, item).transpose()
|
||||||
return Ok(());
|
});
|
||||||
};
|
|
||||||
|
|
||||||
let res = extractor.process(change, context).map_err(Arc::new);
|
let res = extractor.process(changes, context).map_err(Arc::new);
|
||||||
|
|
||||||
// send back the doc_alloc in the pool
|
// send back the doc_alloc in the pool
|
||||||
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
|
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use bumpalo::collections::CollectIn;
|
use bumpalo::collections::CollectIn;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
|
use rayon::iter::IndexedParallelIterator;
|
||||||
|
use rayon::slice::ParallelSlice as _;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend};
|
use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend};
|
||||||
@ -44,8 +45,11 @@ pub struct DocumentDeletionChanges<'indexer> {
|
|||||||
impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
||||||
type Item = DocumentId;
|
type Item = DocumentId;
|
||||||
|
|
||||||
fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator<Item = Self::Item> {
|
fn iter(
|
||||||
self.to_delete.into_par_iter().copied()
|
&self,
|
||||||
|
chunk_size: usize,
|
||||||
|
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||||
|
self.to_delete.par_chunks(chunk_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn item_to_document_change<
|
fn item_to_document_change<
|
||||||
@ -54,12 +58,12 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
|||||||
>(
|
>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
context: &'doc DocumentChangeContext<T>,
|
context: &'doc DocumentChangeContext<T>,
|
||||||
docid: Self::Item,
|
docid: &'doc Self::Item,
|
||||||
) -> Result<Option<DocumentChange<'doc>>>
|
) -> Result<Option<DocumentChange<'doc>>>
|
||||||
where
|
where
|
||||||
'pl: 'doc, // the payload must survive the process calls
|
'pl: 'doc, // the payload must survive the process calls
|
||||||
{
|
{
|
||||||
let current = context.index.document(&context.txn, docid)?;
|
let current = context.index.document(&context.txn, *docid)?;
|
||||||
|
|
||||||
let external_document_id = self.primary_key.extract_docid_from_db(
|
let external_document_id = self.primary_key.extract_docid_from_db(
|
||||||
current,
|
current,
|
||||||
@ -69,7 +73,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
|||||||
|
|
||||||
let external_document_id = external_document_id.to_bump(&context.doc_alloc);
|
let external_document_id = external_document_id.to_bump(&context.doc_alloc);
|
||||||
|
|
||||||
Ok(Some(DocumentChange::Deletion(Deletion::create(docid, external_document_id))))
|
Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id))))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,12 +122,15 @@ mod test {
|
|||||||
Ok(DeletionWithData { deleted })
|
Ok(DeletionWithData { deleted })
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process(
|
fn process<'doc>(
|
||||||
&self,
|
&self,
|
||||||
change: DocumentChange,
|
changes: impl Iterator<Item = crate::Result<DocumentChange<'doc>>>,
|
||||||
context: &DocumentChangeContext<Self::Data>,
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
|
for change in changes {
|
||||||
|
let change = change?;
|
||||||
context.data.deleted.borrow_mut().insert(change.docid());
|
context.data.deleted.borrow_mut().insert(change.docid());
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@ use bumpalo::Bump;
|
|||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use rayon::iter::IntoParallelIterator;
|
use rayon::iter::IntoParallelIterator;
|
||||||
|
use rayon::slice::ParallelSlice;
|
||||||
use serde_json::value::RawValue;
|
use serde_json::value::RawValue;
|
||||||
use IndexDocumentsMethod as Idm;
|
use IndexDocumentsMethod as Idm;
|
||||||
|
|
||||||
@ -209,16 +210,19 @@ impl<'pl> DocumentOperation<'pl> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
|
impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
|
||||||
type Item = &'pl (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>]));
|
type Item = (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>]));
|
||||||
|
|
||||||
fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator<Item = Self::Item> {
|
fn iter(
|
||||||
self.docids_version_offsets.into_par_iter()
|
&self,
|
||||||
|
chunk_size: usize,
|
||||||
|
) -> impl rayon::prelude::IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||||
|
self.docids_version_offsets.par_chunks(chunk_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
context: &'doc DocumentChangeContext<T>,
|
context: &'doc DocumentChangeContext<T>,
|
||||||
item: Self::Item,
|
item: &'doc Self::Item,
|
||||||
) -> Result<Option<DocumentChange<'doc>>>
|
) -> Result<Option<DocumentChange<'doc>>>
|
||||||
where
|
where
|
||||||
'pl: 'doc,
|
'pl: 'doc,
|
||||||
|
@ -5,8 +5,8 @@ use std::thread::{self, Builder};
|
|||||||
use big_s::S;
|
use big_s::S;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use document_changes::{
|
use document_changes::{
|
||||||
for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, RefCellExt,
|
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
||||||
ThreadLocal,
|
IndexingContext, RefCellExt, ThreadLocal,
|
||||||
};
|
};
|
||||||
pub use document_deletion::DocumentDeletion;
|
pub use document_deletion::DocumentDeletion;
|
||||||
pub use document_operation::DocumentOperation;
|
pub use document_operation::DocumentOperation;
|
||||||
@ -33,7 +33,7 @@ use crate::update::new::channel::ExtractorSender;
|
|||||||
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
||||||
use crate::update::settings::InnerIndexSettings;
|
use crate::update::settings::InnerIndexSettings;
|
||||||
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
||||||
use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
||||||
|
|
||||||
pub(crate) mod de;
|
pub(crate) mod de;
|
||||||
pub mod document_changes;
|
pub mod document_changes;
|
||||||
@ -56,10 +56,10 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
|||||||
Ok(FullySend(()))
|
Ok(FullySend(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process(
|
fn process<'doc>(
|
||||||
&self,
|
&self,
|
||||||
change: DocumentChange,
|
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||||
context: &document_changes::DocumentChangeContext<Self::Data>,
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut document_buffer = Vec::new();
|
let mut document_buffer = Vec::new();
|
||||||
|
|
||||||
@ -67,6 +67,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
|||||||
let new_fields_ids_map = &*new_fields_ids_map;
|
let new_fields_ids_map = &*new_fields_ids_map;
|
||||||
let new_fields_ids_map = new_fields_ids_map.local_map();
|
let new_fields_ids_map = new_fields_ids_map.local_map();
|
||||||
|
|
||||||
|
for change in changes {
|
||||||
|
let change = change?;
|
||||||
let external_docid = change.external_docid().to_owned();
|
let external_docid = change.external_docid().to_owned();
|
||||||
|
|
||||||
// document but we need to create a function that collects and compresses documents.
|
// document but we need to create a function that collects and compresses documents.
|
||||||
@ -93,6 +95,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
|||||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,14 +45,17 @@ where
|
|||||||
{
|
{
|
||||||
type Item = Box<RawValue>;
|
type Item = Box<RawValue>;
|
||||||
|
|
||||||
fn iter(&self) -> impl IndexedParallelIterator<Item = Self::Item> {
|
fn iter(
|
||||||
self.iter.clone()
|
&self,
|
||||||
|
chunk_size: usize,
|
||||||
|
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||||
|
self.iter.clone().chunks(chunk_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
context: &'doc DocumentChangeContext<T>,
|
context: &'doc DocumentChangeContext<T>,
|
||||||
document: Self::Item,
|
document: &'doc Self::Item,
|
||||||
) -> Result<Option<DocumentChange<'doc>>>
|
) -> Result<Option<DocumentChange<'doc>>>
|
||||||
where
|
where
|
||||||
'index: 'doc,
|
'index: 'doc,
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
use std::collections::BTreeMap;
|
|
||||||
|
|
||||||
use raw_collections::RawMap;
|
use raw_collections::RawMap;
|
||||||
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
|
use rayon::iter::IndexedParallelIterator;
|
||||||
|
use rayon::slice::ParallelSlice as _;
|
||||||
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
|
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
@ -12,8 +11,8 @@ use crate::documents::PrimaryKey;
|
|||||||
use crate::error::{FieldIdMapMissingEntry, InternalError};
|
use crate::error::{FieldIdMapMissingEntry, InternalError};
|
||||||
use crate::update::new::document::DocumentFromVersions;
|
use crate::update::new::document::DocumentFromVersions;
|
||||||
use crate::update::new::document_change::Versions;
|
use crate::update::new::document_change::Versions;
|
||||||
use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, KvWriterFieldId, Update};
|
use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update};
|
||||||
use crate::{all_obkv_to_json, Error, FieldsIdsMap, GlobalFieldsIdsMap, Object, Result, UserError};
|
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
|
||||||
|
|
||||||
pub struct UpdateByFunction {
|
pub struct UpdateByFunction {
|
||||||
documents: RoaringBitmap,
|
documents: RoaringBitmap,
|
||||||
@ -76,14 +75,17 @@ impl UpdateByFunction {
|
|||||||
impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
||||||
type Item = u32;
|
type Item = u32;
|
||||||
|
|
||||||
fn iter(&self) -> impl IndexedParallelIterator<Item = Self::Item> {
|
fn iter(
|
||||||
self.documents.par_iter().copied()
|
&self,
|
||||||
|
chunk_size: usize,
|
||||||
|
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||||
|
self.documents.as_slice().par_chunks(chunk_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||||
&self,
|
&self,
|
||||||
context: &'doc DocumentChangeContext<T>,
|
context: &'doc DocumentChangeContext<T>,
|
||||||
docid: Self::Item,
|
docid: &'doc Self::Item,
|
||||||
) -> Result<Option<DocumentChange<'doc>>>
|
) -> Result<Option<DocumentChange<'doc>>>
|
||||||
where
|
where
|
||||||
'index: 'doc,
|
'index: 'doc,
|
||||||
@ -97,6 +99,8 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
|||||||
..
|
..
|
||||||
} = context;
|
} = context;
|
||||||
|
|
||||||
|
let docid = *docid;
|
||||||
|
|
||||||
// safety: Both documents *must* exists in the database as
|
// safety: Both documents *must* exists in the database as
|
||||||
// their IDs comes from the list of documents ids.
|
// their IDs comes from the list of documents ids.
|
||||||
let document = index.document(txn, docid)?;
|
let document = index.document(txn, docid)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user