use std::ops::DerefMut;

use rayon::iter::IndexedParallelIterator;
use serde_json::value::RawValue;

use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt};
use crate::documents::PrimaryKey;
use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
use crate::update::new::document::Versions;
use crate::update::new::{DocumentChange, Insertion};
use crate::{Error, InternalError, Result, UserError};
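
/// A partial dump of documents to index: wraps an iterator that yields one raw
/// JSON document (`Box<RawValue>`) per entry, e.g. one line of a JSONL dump.
///
/// Illustrative sketch of how it is wired up; `documents`,
/// `concurrent_available_ids`, and `primary_key` are assumed to exist for the
/// example and are not defined by this module:
///
/// ```ignore
/// use rayon::prelude::*;
///
/// let documents: Vec<Box<RawValue>> = /* one RawValue per JSONL line */;
/// let dump = PartialDump::new_from_jsonlines(documents.par_iter().cloned());
/// let changes = dump.into_changes(&concurrent_available_ids, &primary_key);
/// ```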
pub struct PartialDump<I> {
    iter: I,
}

impl<I> PartialDump<I> {
    pub fn new_from_jsonlines(iter: I) -> Self {
        PartialDump { iter }
    }
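
    /// Consumes this partial dump and returns a [`DocumentChanges`]
    /// implementation that draws internal docids from
    /// `concurrent_available_ids` and resolves external ids through
    /// `primary_key`.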
    pub fn into_changes<'index>(
        self,
        concurrent_available_ids: &'index ConcurrentAvailableIds,
        primary_key: &'index PrimaryKey,
    ) -> PartialDumpChanges<'index, I> {
        // Note for future self:
        //   - We recommend sending chunks of documents in this `PartialDumpIndexer`;
        //     we therefore need to create a custom `take_while_size` method (one
        //     that doesn't drop items).
        PartialDumpChanges { iter: self.iter, concurrent_available_ids, primary_key }
    }
}
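
/// The [`DocumentChanges`] view over a [`PartialDump`]: the raw-document
/// iterator plus the id allocator and primary key needed to build `Insertion`
/// changes.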
pub struct PartialDumpChanges<'doc, I> {
    iter: I,
    concurrent_available_ids: &'doc ConcurrentAvailableIds,
    primary_key: &'doc PrimaryKey<'doc>,
}

impl<'index, Iter> DocumentChanges<'index> for PartialDumpChanges<'index, Iter>
where
    Iter: IndexedParallelIterator<Item = Box<RawValue>> + Clone + Sync + 'index,
{
    type Item = Box<RawValue>;
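
    /// Splits the underlying document iterator into chunks of `chunk_size` so
    /// that each worker can process a whole chunk at a time.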
    fn iter(
        &self,
        chunk_size: usize,
    ) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
        self.iter.clone().chunks(chunk_size)
    }
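
    /// Converts one raw JSON document into an [`Insertion`] change: it
    /// allocates a fresh internal docid, resolves the external document id via
    /// the primary key, and copies the document into the per-document arena.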
    fn item_to_document_change<'doc, T: MostlySend + 'doc>(
        &'doc self,
        context: &'doc DocumentChangeContext<T>,
        document: &'doc Self::Item,
    ) -> Result<Option<DocumentChange<'doc>>>
    where
        'index: 'doc,
    {
        let doc_alloc = &context.doc_alloc;
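        // Reserve a fresh internal docid, failing once the id space is exhausted.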
        let docid = match self.concurrent_available_ids.next() {
            Some(id) => id,
            None => return Err(Error::UserError(UserError::DocumentLimitReached)),
        };

        let mut fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
        let fields_ids_map = fields_ids_map.deref_mut();
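
        // Copy the raw JSON into the per-document arena so it lives long enough.
        // SAFETY: `RawValue` is a `#[repr(transparent)]` wrapper around `str`, and
        // this `&str` was obtained from a valid `RawValue` via `get()`, so casting
        // it back to `&RawValue` is sound.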
        let document = doc_alloc.alloc_str(document.get());
        let document: &RawValue = unsafe { std::mem::transmute(document) };

        let external_document_id =
            self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
        let external_document_id = external_document_id.to_de();
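
        // Parse the raw document into a map whose keys and values are backed by
        // the per-document arena.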
        let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
            .map_err(InternalError::SerdeJson)?;

        let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
        Ok(Some(DocumentChange::Insertion(insertion)))
    }

    fn len(&self) -> usize {
        self.iter.len()
    }
}