wip

2025-03-13 12:21:40 +08:00 · 2025-01-27 18:22:07 +01:00 · 2025-01-27 18:22:07 +01:00 · e017986dde
commit e017986dde
parent ad0765ffa4
6 changed files with 69 additions and 56 deletions
--- a/crates/dump/src/lib.rs
+++ b/crates/dump/src/lib.rs
@ -1,7 +1,8 @@
 #![allow(clippy::type_complexity)]
 #![allow(clippy::wrong_self_convention)]

-use meilisearch_types::batches::BatchId;
+use meilisearch_types::batch_view::BatchView;
+use meilisearch_types::batches::{Batch, BatchId};
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::keys::Key;
 use meilisearch_types::milli::update::IndexDocumentsMethod;
@ -91,6 +92,13 @@ pub struct TaskDump {
    pub finished_at: Option<OffsetDateTime>,
 }

+#[derive(Debug, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct BatchDump {
+    pub original: Batch,
+    pub tasks: RoaringBitmap,
+}
+
 // A `Kind` specific version made for the dump. If modified you may break the dump.
 #[derive(Debug, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
--- a/crates/dump/src/reader/mod.rs
+++ b/crates/dump/src/reader/mod.rs
@ -101,6 +101,16 @@ impl DumpReader {
        }
    }

+    pub fn batches(&mut self) -> Result<Box<dyn Iterator<Item = Result<v6::Batch>> + '_>> {
+        match self {
+            DumpReader::Current(current) => Ok(current.dumps()),
+            // There was no batches in the previous version
+            DumpReader::Compat(_compat) => {
+                Ok(Box::new(std::iter::empty()) as Box<dyn Iterator<Item = Result<v6::Batch>> + '_>)
+            }
+        }
+    }
+
    pub fn keys(&mut self) -> Result<Box<dyn Iterator<Item = Result<v6::Key>> + '_>> {
        match self {
            DumpReader::Current(current) => Ok(current.keys()),
--- a/crates/dump/src/reader/v6/mod.rs
+++ b/crates/dump/src/reader/v6/mod.rs
@ -18,6 +18,7 @@ pub type Checked = meilisearch_types::settings::Checked;
 pub type Unchecked = meilisearch_types::settings::Unchecked;

 pub type Task = crate::TaskDump;
+pub type Batch = crate::BatchDump;
 pub type Key = meilisearch_types::keys::Key;
 pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures;

@ -48,6 +49,7 @@ pub struct V6Reader {
    instance_uid: Option<Uuid>,
    metadata: Metadata,
    tasks: BufReader<File>,
+    batches: BufReader<File>,
    keys: BufReader<File>,
    features: Option<RuntimeTogglableFeatures>,
 }
@ -82,6 +84,7 @@ impl V6Reader {
            metadata: serde_json::from_reader(&*meta_file)?,
            instance_uid,
            tasks: BufReader::new(File::open(dump.path().join("tasks").join("queue.jsonl"))?),
+            batches: BufReader::new(File::open(dump.path().join("batches").join("queue.jsonl"))?),
            keys: BufReader::new(File::open(dump.path().join("keys.jsonl"))?),
            features,
            dump,
@ -124,7 +127,7 @@ impl V6Reader {
        &mut self,
    ) -> Box<dyn Iterator<Item = Result<(Task, Option<Box<super::UpdateFile>>)>> + '_> {
        Box::new((&mut self.tasks).lines().map(|line| -> Result<_> {
-            let task: Task = serde_json::from_str(&line?).unwrap();
+            let task: Task = serde_json::from_str(&line?)?;

            let update_file_path = self
                .dump
@ -145,6 +148,14 @@ impl V6Reader {
        }))
    }

+    pub fn dumps(&mut self) -> Box<dyn Iterator<Item = Result<Batch>> + '_> {
+        Box::new(
+            (&mut self.batches)
+                .lines()
+                .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }),
+        )
+    }
+
    pub fn keys(&mut self) -> Box<dyn Iterator<Item = Result<Key>> + '_> {
        Box::new(
            (&mut self.keys).lines().map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }),
--- a/crates/dump/src/writer.rs
+++ b/crates/dump/src/writer.rs
@ -14,7 +14,7 @@ use time::OffsetDateTime;
 use uuid::Uuid;

 use crate::reader::Document;
-use crate::{IndexMetadata, Metadata, Result, TaskDump, CURRENT_DUMP_VERSION};
+use crate::{BatchDump, IndexMetadata, Metadata, Result, TaskDump, CURRENT_DUMP_VERSION};

 pub struct DumpWriter {
    dir: TempDir,
@ -174,7 +174,7 @@ impl BatchWriter {

    /// Pushes batches in the dump.
    /// The batches doesn't contains any private information thus we don't need a special type like with the tasks.
-    pub fn push_batch(&mut self, batch: &BatchView) -> Result<()> {
+    pub fn push_batch(&mut self, batch: &BatchDump) -> Result<()> {
        self.queue.write_all(&serde_json::to_vec(batch)?)?;
        self.queue.write_all(b"\n")?;
        Ok(())
--- a/crates/index-scheduler/src/batch.rs
+++ b/crates/index-scheduler/src/batch.rs
@ -26,7 +26,7 @@ use std::sync::atomic::Ordering;

 use bumpalo::collections::CollectIn;
 use bumpalo::Bump;
-use dump::IndexMetadata;
+use dump::{BatchDump, IndexMetadata};
 use meilisearch_types::batch_view::BatchView;
 use meilisearch_types::batches::BatchId;
 use meilisearch_types::heed::{RoTxn, RwTxn};
@ -881,7 +881,12 @@ impl IndexScheduler {
                        batch.started_at = started_at;
                        batch.finished_at = Some(finished_at);
                    }
-                    dump_batches.push_batch(&BatchView::from_batch(&batch))?;
+                    // a missing or empty batch shouldn't exists but if it happens we should just skip it
+                    if let Some(tasks) = self.batch_to_tasks_mapping.get(&rtxn, &batch.uid)? {
+                        if !tasks.is_empty() {
+                            dump_batches.push_batch(&BatchDump { original: batch, tasks })?;
+                        }
+                    }
                    atomic.fetch_add(1, Ordering::Relaxed);
                }
                dump_batches.flush()?;
--- a/crates/index-scheduler/src/lib.rs
+++ b/crates/index-scheduler/src/lib.rs
@ -43,7 +43,7 @@ use std::sync::atomic::{AtomicBool, AtomicU32};
 use std::sync::{Arc, RwLock};
 use std::time::Duration;

-use dump::{KindDump, TaskDump, UpdateFile};
+use dump::{BatchDump, KindDump, TaskDump, UpdateFile};
 pub use error::Error;
 pub use features::RoFeatures;
 use file_store::FileStore;
@ -1996,6 +1996,10 @@ pub struct Dump<'a> {
    statuses: HashMap<Status, RoaringBitmap>,
    kinds: HashMap<Kind, RoaringBitmap>,

+    batch_indexes: HashMap<String, RoaringBitmap>,
+    batch_statuses: HashMap<Status, RoaringBitmap>,
+    batch_kinds: HashMap<Kind, RoaringBitmap>,
+
    batch_to_tasks_mapping: HashMap<TaskId, RoaringBitmap>,
 }

@ -2010,6 +2014,9 @@ impl<'a> Dump<'a> {
            indexes: HashMap::new(),
            statuses: HashMap::new(),
            kinds: HashMap::new(),
+            batch_indexes: HashMap::new(),
+            batch_statuses: HashMap::new(),
+            batch_kinds: HashMap::new(),
            batch_to_tasks_mapping: HashMap::new(),
        })
    }
@ -2171,63 +2178,35 @@ impl<'a> Dump<'a> {

    /// Register a new task coming from a dump in the scheduler.
    /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running.
-    pub fn register_dumped_batch(&mut self, batch: BatchView) -> Result<()> {
-        let batch = Batch {
-            uid: batch.uid,
-            // Batch cannot be processing while we import it
-            progress: None,
-            details: batch.details,
-            stats: batch.stats,
-            started_at: batch.started_at,
-            finished_at: batch.finished_at,
-        };
+    pub fn register_dumped_batch(&mut self, dump: BatchDump) -> Result<()> {
+        let BatchDump { original, tasks } = dump;

-        self.index_scheduler.all_batches.put(&mut self.wtxn, &batch.uid, &batch)?;
-
-        for index in batch.indexes() {
-            match self.indexes.get_mut(index) {
-                Some(bitmap) => {
-                    bitmap.insert(batch.uid);
-                }
-                None => {
-                    let mut bitmap = RoaringBitmap::new();
-                    bitmap.insert(batch.uid);
-                    self.indexes.insert(index.to_string(), bitmap);
-                }
-            };
-        }
+        self.index_scheduler.all_batches.put(&mut self.wtxn, &original.uid, &original)?;
+        self.index_scheduler..put(&mut self.wtxn, &original.uid, &original)?;

        utils::insert_task_datetime(
            &mut self.wtxn,
-            self.index_scheduler.enqueued_at,
-            batch.enqueued_at,
-            batch.uid,
+            self.index_scheduler.batch_enqueued_at,
+            original.started_at, // TODO: retrieve the enqueued_at from the dump
+            original.uid,
        )?;

-        // we can't override the started_at & finished_at, so we must only set it if the tasks is finished and won't change
-        if matches!(batch.status, Status::Succeeded | Status::Failed | Status::Canceled) {
-            if let Some(started_at) = batch.started_at {
-                utils::insert_task_datetime(
-                    &mut self.wtxn,
-                    self.index_scheduler.started_at,
-                    started_at,
-                    batch.uid,
-                )?;
-            }
-            if let Some(finished_at) = batch.finished_at {
-                utils::insert_task_datetime(
-                    &mut self.wtxn,
-                    self.index_scheduler.finished_at,
-                    finished_at,
-                    batch.uid,
-                )?;
-            }
+        utils::insert_task_datetime(
+            &mut self.wtxn,
+            self.index_scheduler.started_at,
+            original.started_at,
+            original.uid,
+        )?;
+        if let Some(finished_at) = original.finished_at {
+            utils::insert_task_datetime(
+                &mut self.wtxn,
+                self.index_scheduler.finished_at,
+                finished_at,
+                original.uid,
+            )?;
        }

-        self.statuses.entry(batch.status).or_default().insert(batch.uid);
-        self.kinds.entry(batch.kind.as_kind()).or_default().insert(batch.uid);
-
-        Ok(batch)
+        Ok(())
    }

    /// Commit all the changes and exit the importing dump state