meilisearch/milli/src/update/new/indexer/partial_dump.rs

use rayon::iter::IndexedParallelIterator;

use super::DocumentChanges;
use crate::documents::{DocumentIdExtractionError, PrimaryKey};
use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId};
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};

pub struct PartialDump<I> {
    iter: I,
}

impl<I> PartialDump<I> {
    pub fn new_from_jsonlines(iter: I) -> Self {
        PartialDump { iter }
    }
}

impl<'p, I> DocumentChanges<'p> for PartialDump<I>
where
    I: IndexedParallelIterator<Item = Object> + Clone + 'p,
{
    type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);

    /// Note for future self:
    ///   - the field ids map must already be valid so you must have to generate it beforehand.
    ///   - We should probably expose another method that generates the fields ids map from an iterator of JSON objects.
    ///   - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
    fn document_changes(
        self,
        _fields_ids_map: &mut FieldsIdsMap,
        param: Self::Parameter,
    ) -> Result<
        impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
            + Clone
            + 'p,
    > {
        let (fields_ids_map, concurrent_available_ids, primary_key) = param;

        Ok(self.iter.map(|object| {
            let docid = match concurrent_available_ids.next() {
                Some(id) => id,
                None => return Err(Error::UserError(UserError::DocumentLimitReached)),
            };

            let mut writer = KvWriterFieldId::memory();
            object.iter().for_each(|(key, value)| {
                let key = fields_ids_map.id(key).unwrap();
                /// TODO better error management
                let value = serde_json::to_vec(&value).unwrap();
                /// TODO it is not ordered
                writer.insert(key, value).unwrap();
            });

            let document = writer.into_boxed();
            let external_docid = match primary_key.document_id(&document, fields_ids_map)? {
                Ok(document_id) => Ok(document_id),
                Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
                Err(DocumentIdExtractionError::MissingDocumentId) => {
                    Err(UserError::MissingDocumentId {
                        primary_key: primary_key.name().to_string(),
                        document: all_obkv_to_json(&document, fields_ids_map)?,
                    })
                }
                Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
                    Err(UserError::TooManyDocumentIds {
                        primary_key: primary_key.name().to_string(),
                        document: all_obkv_to_json(&document, fields_ids_map)?,
                    })
                }
            }?;

            let insertion = Insertion::create(docid, document);
            Ok(DocumentChange::Insertion(insertion))
        }))
    }
}
Appease some of clippy warnings 2024-09-30 16:08:29 +02:00			`use rayon::iter::IndexedParallelIterator;`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00
I push for Many 2024-09-02 15:10:21 +02:00			`use super::DocumentChanges;`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`use crate::documents::{DocumentIdExtractionError, PrimaryKey};`
			`use crate::update::concurrent_available_ids::ConcurrentAvailableIds;`
			`use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId};`
			`use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};`

Renaming the indexers 2024-09-02 14:42:27 +02:00			`pub struct PartialDump<I> {`
Find a temporary solution to par into iter on an HashMap Spoiler: Do not use an HashMap but drain it into a Vec 2024-09-02 19:39:48 +02:00			`iter: I,`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`}`

Renaming the indexers 2024-09-02 14:42:27 +02:00			`impl<I> PartialDump<I> {`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`pub fn new_from_jsonlines(iter: I) -> Self {`
Renaming the indexers 2024-09-02 14:42:27 +02:00			`PartialDump { iter }`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`}`
			`}`

I push for Many 2024-09-02 15:10:21 +02:00			`impl<'p, I> DocumentChanges<'p> for PartialDump<I>`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`where`
Expose an IndexedParallelIterator to the index function 2024-09-24 17:24:50 +02:00			`I: IndexedParallelIterator<Item = Object> + Clone + 'p,`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`{`
			`type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);`

			`/// Note for future self:`
			`/// - the field ids map must already be valid so you must have to generate it beforehand.`
			`/// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects.`
			/// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
			`fn document_changes(`
			`self,`
Use the GlobalFieldsIdsMap everywhere and write it to disk Co-authored-by: Dureuill <louis@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com> 2024-09-03 12:01:01 +02:00			`_fields_ids_map: &mut FieldsIdsMap,`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`param: Self::Parameter,`
WIP using try_map_try_init 2024-09-26 18:59:28 +02:00			`) -> Result<`
			`impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>`
			`+ Clone`
			`+ 'p,`
			`> {`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`let (fields_ids_map, concurrent_available_ids, primary_key) = param;`

Expose an IndexedParallelIterator to the index function 2024-09-24 17:24:50 +02:00			`Ok(self.iter.map(\|object\| {`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`let docid = match concurrent_available_ids.next() {`
			`Some(id) => id,`
			`None => return Err(Error::UserError(UserError::DocumentLimitReached)),`
			`};`

			`let mut writer = KvWriterFieldId::memory();`
			`object.iter().for_each(\|(key, value)\| {`
			`let key = fields_ids_map.id(key).unwrap();`
			`/// TODO better error management`
			`let value = serde_json::to_vec(&value).unwrap();`
Renaming the indexers 2024-09-02 14:42:27 +02:00			`/// TODO it is not ordered`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`writer.insert(key, value).unwrap();`
			`});`

			`let document = writer.into_boxed();`
			`let external_docid = match primary_key.document_id(&document, fields_ids_map)? {`
			`Ok(document_id) => Ok(document_id),`
			`Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),`
			`Err(DocumentIdExtractionError::MissingDocumentId) => {`
			`Err(UserError::MissingDocumentId {`
			`primary_key: primary_key.name().to_string(),`
			`document: all_obkv_to_json(&document, fields_ids_map)?,`
			`})`
			`}`
			`Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {`
			`Err(UserError::TooManyDocumentIds {`
			`primary_key: primary_key.name().to_string(),`
			`document: all_obkv_to_json(&document, fields_ids_map)?,`
			`})`
			`}`
			`}?;`

WIP geo fields 2024-09-12 18:01:02 +02:00			`let insertion = Insertion::create(docid, document);`
Remove the useless option from the document changes 2024-09-02 15:21:00 +02:00			`Ok(DocumentChange::Insertion(insertion))`
Move the indexers into their own modules 2024-09-02 10:42:19 +02:00			`}))`
			`}`
			`}`