From 0c68b9ed4cf4f1ad52159576723a31d98eceb654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 31 Aug 2023 15:56:42 +0200 Subject: [PATCH] WIP making the final snapshot swap --- index-scheduler/src/index_mapper/index_map.rs | 5 + index-scheduler/src/index_mapper/mod.rs | 5 + index-scheduler/src/lib.rs | 359 ++++++++++-------- index-scheduler/src/lru.rs | 6 + 4 files changed, 216 insertions(+), 159 deletions(-) diff --git a/index-scheduler/src/index_mapper/index_map.rs b/index-scheduler/src/index_mapper/index_map.rs index a24213558..5cdadcd3b 100644 --- a/index-scheduler/src/index_mapper/index_map.rs +++ b/index-scheduler/src/index_mapper/index_map.rs @@ -295,6 +295,11 @@ impl IndexMap { "Attempt to finish deletion of an index that was being closed" ); } + + /// Returns the indexes that were opened by the `IndexMap`. + pub fn clear(&mut self) -> Vec { + self.available.clear().into_iter().map(|(_, (_, index))| index).collect() + } } /// Create or open an index in the specified path. diff --git a/index-scheduler/src/index_mapper/mod.rs b/index-scheduler/src/index_mapper/mod.rs index 7850966cf..00bf023b1 100644 --- a/index-scheduler/src/index_mapper/mod.rs +++ b/index-scheduler/src/index_mapper/mod.rs @@ -428,6 +428,11 @@ impl IndexMapper { Ok(()) } + /// Returns the indexes that were opened by the `IndexMapper`. + pub fn clear(&mut self) -> Vec { + self.index_map.write().unwrap().clear() + } + /// The stats of an index. /// /// If available in the cache, they are directly returned. diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index adbc06763..c5734dcfb 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -294,92 +294,13 @@ impl IndexScheduler { #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, ) -> Result { - std::fs::create_dir_all(&options.tasks_path)?; - std::fs::create_dir_all(&options.update_file_path)?; - std::fs::create_dir_all(&options.indexes_path)?; - std::fs::create_dir_all(&options.dumps_path)?; - - if cfg!(windows) && options.enable_mdb_writemap { - // programmer error if this happens: in normal use passing the option on Windows is an error in main - panic!("Windows doesn't support the MDB_WRITEMAP LMDB option"); - } - - let task_db_size = clamp_to_page_size(options.task_db_size); - let budget = if options.indexer_config.skip_index_budget { - IndexBudget { - map_size: options.index_base_map_size, - index_count: options.index_count, - task_db_size, - } - } else { - Self::index_budget( - &options.tasks_path, - options.index_base_map_size, - task_db_size, - options.index_count, - ) - }; - - let env = heed::EnvOpenOptions::new() - .max_dbs(11) - .map_size(budget.task_db_size) - .open(options.tasks_path)?; - - let features = features::FeatureData::new(&env, options.instance_features)?; - - let file_store = FileStore::new(&options.update_file_path)?; - - let mut wtxn = env.write_txn()?; - let all_tasks = env.create_database(&mut wtxn, Some(db_name::ALL_TASKS))?; - let status = env.create_database(&mut wtxn, Some(db_name::STATUS))?; - let kind = env.create_database(&mut wtxn, Some(db_name::KIND))?; - let index_tasks = env.create_database(&mut wtxn, Some(db_name::INDEX_TASKS))?; - let canceled_by = env.create_database(&mut wtxn, Some(db_name::CANCELED_BY))?; - let enqueued_at = env.create_database(&mut wtxn, Some(db_name::ENQUEUED_AT))?; - let started_at = env.create_database(&mut wtxn, Some(db_name::STARTED_AT))?; 
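// A minimal sketch of how the new `clear` chain is intended to be used, based on the
// calls that appear later in this patch: `IndexMapper::clear` drains the inner LRU and
// hands back the still-open `Index` handles (assumed here to be `Vec<Index>`; the
// generic parameter is not visible in this hunk), and the caller then waits for every
// LMDB environment to actually close before touching the files on disk.
fn close_all_indexes(index_mapper: &mut IndexMapper) {
    // Drain the map so no new reader can pick up one of these indexes.
    let indexes: Vec<Index> = index_mapper.clear();
    // Ask every index to close its environment...
    let closing_events: Vec<_> =
        indexes.into_iter().map(|index| index.prepare_for_closing()).collect();
    // ...and only proceed once all of them are really closed.
    closing_events.into_iter().for_each(|event| event.wait());
}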
- let finished_at = env.create_database(&mut wtxn, Some(db_name::FINISHED_AT))?; - wtxn.commit()?; - - // allow unreachable_code to get rids of the warning in the case of a test build. - let inner = IndexSchedulerInner { - must_stop_processing: MustStopProcessing::default(), - processing_tasks: Arc::new(RwLock::new(ProcessingTasks::new())), - file_store, - all_tasks, - status, - kind, - index_tasks, - canceled_by, - enqueued_at, - started_at, - finished_at, - index_mapper: IndexMapper::new( - &env, - options.indexes_path, - budget.map_size, - options.index_growth_amount, - budget.index_count, - options.enable_mdb_writemap, - options.indexer_config, - )?, - env, - // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things - wake_up: Arc::new(SignalEvent::auto(true)), - autobatching_enabled: options.autobatching_enabled, - max_number_of_tasks: options.max_number_of_tasks, - dumps_path: options.dumps_path, - snapshots_path: options.snapshots_path, - auth_path: options.auth_path, - version_file_path: options.version_file_path, - zookeeper: options.zookeeper, + let inner = IndexSchedulerInner::new( + options, #[cfg(test)] test_breakpoint_sdr, #[cfg(test)] planned_failures, - #[cfg(test)] - run_loop_iteration: Arc::new(RwLock::new(0)), - features, - }; + )?; // initialize the directories we need to process batches. if let Some(zookeeper) = &inner.zookeeper { @@ -428,75 +349,6 @@ impl IndexScheduler { Ok(()) } - fn index_budget( - tasks_path: &Path, - base_map_size: usize, - mut task_db_size: usize, - max_index_count: usize, - ) -> IndexBudget { - #[cfg(windows)] - const DEFAULT_BUDGET: usize = 6 * 1024 * 1024 * 1024 * 1024; // 6 TiB, 1 index - #[cfg(not(windows))] - const DEFAULT_BUDGET: usize = 80 * 1024 * 1024 * 1024 * 1024; // 80 TiB, 18 indexes - - let budget = if Self::is_good_heed(tasks_path, DEFAULT_BUDGET) { - DEFAULT_BUDGET - } else { - log::debug!("determining budget with dichotomic search"); - utils::dichotomic_search(DEFAULT_BUDGET / 2, |map_size| { - Self::is_good_heed(tasks_path, map_size) - }) - }; - - log::debug!("memmap budget: {budget}B"); - let mut budget = budget / 2; - if task_db_size > (budget / 2) { - task_db_size = clamp_to_page_size(budget * 2 / 5); - log::debug!( - "Decreasing max size of task DB to {task_db_size}B due to constrained memory space" - ); - } - budget -= task_db_size; - - // won't be mutated again - let budget = budget; - let task_db_size = task_db_size; - - log::debug!("index budget: {budget}B"); - let mut index_count = budget / base_map_size; - if index_count < 2 { - // take a bit less than half than the budget to make sure we can always afford to open an index - let map_size = (budget * 2) / 5; - // single index of max budget - log::debug!("1 index of {map_size}B can be opened simultaneously."); - return IndexBudget { map_size, index_count: 1, task_db_size }; - } - // give us some space for an additional index when the cache is already full - // decrement is OK because index_count >= 2. 
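// Extracting the constructor into `IndexSchedulerInner::new` is what makes the snapshot
// swap later in this patch possible: the scheduler's state can be taken out, rebuilt, and
// put back while callers keep holding the outer handle. The `lock.take()` and
// `*lock = Some(..)` calls further down imply the inner value lives behind something like
// `Arc<RwLock<Option<IndexSchedulerInner>>>`. A small, self-contained model of that
// pattern, with illustrative names and std's RwLock rather than the crate's:
use std::sync::{Arc, RwLock};

struct Inner {
    generation: u64, // stand-in for the real scheduler state
}

#[derive(Clone)]
struct SwappableScheduler {
    inner: Arc<RwLock<Option<Inner>>>,
}

impl SwappableScheduler {
    /// Take the state out, rebuild it, and put the new one back, all under the write
    /// lock so no reader ever observes a half-swapped scheduler.
    fn swap_inner(&self, rebuild: impl FnOnce(Inner) -> Inner) {
        let mut lock = self.inner.write().unwrap();
        let old = lock.take().expect("inner was already taken");
        *lock = Some(rebuild(old));
    }
}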
- index_count -= 1; - if index_count > max_index_count { - index_count = max_index_count; - } - log::debug!("Up to {index_count} indexes of {base_map_size}B opened simultaneously."); - IndexBudget { map_size: base_map_size, index_count, task_db_size } - } - - fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { - if let Ok(env) = - heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) - { - env.prepare_for_closing().wait(); - true - } else { - // We're treating all errors equally here, not only allocation errors. - // This means there's a possiblity for the budget to lower due to errors different from allocation errors. - // For persistent errors, this is OK as long as the task db is then reopened normally without ignoring the error this time. - // For transient errors, this could lead to an instance with too low a budget. - // However transient errors are: 1) less likely than persistent errors 2) likely to cause other issues down the line anyway. - false - } - } - /// Start the run loop for the given index scheduler. /// /// This function will execute in a different thread and must be called @@ -534,8 +386,6 @@ impl IndexScheduler { zookeeper .add_watch("/snapshots", AddWatchMode::PersistentRecursive, move |event| { if !latchc.has_leadership() { - let inner = this.inner(); - let WatchedEvent { event_type, path, keeper_state: _ } = event; match event_type { WatchedEventType::NodeCreated => { @@ -553,24 +403,32 @@ impl IndexScheduler { snapshot_id )); + let inner = this.inner(); + // 1. TODO: Ensure the snapshot version file is the same as our version. // 2. Download all the databases + let base_path = { + let mut path = inner.env.path().to_path_buf(); + assert!(path.pop()); + path + }; + let tasks_file = tempfile::NamedTempFile::new_in(inner.env.path()).unwrap(); log::info!("Downloading the index scheduler database."); let tasks_snapshot = snapshot_dir.join("tasks.mdb"); - std::fs::copy(tasks_snapshot, tasks_file).unwrap(); + std::fs::copy(&tasks_snapshot, tasks_file).unwrap(); log::info!("Downloading the indexes databases"); let indexes_files = tempfile::TempDir::new_in(&inner.index_mapper.base_path) .unwrap(); - let mut indexes = Vec::new(); - let dst = snapshot_dir.join("indexes"); - for result in std::fs::read_dir(&dst).unwrap() { + let mut indexes = Vec::new(); + let src = snapshot_dir.join("indexes"); + for result in std::fs::read_dir(&src).unwrap() { let entry = result.unwrap(); let uuid = entry .file_name() @@ -580,7 +438,7 @@ impl IndexScheduler { .to_string(); log::info!("\tDownloading the index {}", uuid.to_string()); std::fs::copy( - dst.join(&uuid), + src.join(&uuid), indexes_files.path().join(&uuid), ) .unwrap(); @@ -588,7 +446,28 @@ impl IndexScheduler { } // 3. Lock the index-mapper and close all the env - // TODO: continue here + drop(inner); + let mut lock = this.inner.write(); + let mut raw_inner = lock.take().unwrap(); + + // Wait for others to finish what they started + let indexes = raw_inner.index_mapper.clear(); + let mut pfcs: Vec<_> = indexes + .into_iter() + .map(|index| index.prepare_for_closing()) + .collect(); + pfcs.push(raw_inner.env.prepare_for_closing()); + pfcs.into_iter().for_each(|pfc| pfc.wait()); + + // Let's replace all the folders/files. 
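// The folder/file replacement that follows relies on `std::fs::rename`. One caveat,
// stated here as an observation rather than something this WIP patch already handles:
// `rename` will not replace a destination directory that still has entries in it
// (ENOTEMPTY on Unix), so the old `indexes/` directory would need to be emptied or
// moved aside first. A self-contained demonstration using only the standard library:
use std::fs;

fn main() -> std::io::Result<()> {
    let base = std::env::temp_dir().join("rename-demo");
    let src = base.join("new-indexes");
    let dst = base.join("indexes");
    fs::create_dir_all(&src)?;
    fs::create_dir_all(&dst)?;
    fs::write(dst.join("some-index.mdb"), b"old data")?; // destination is non-empty
    // Renaming a directory onto a non-empty directory fails.
    assert!(fs::rename(&src, &dst).is_err());
    fs::remove_dir_all(&base)
}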
+ std::fs::rename(&tasks_snapshot, base_path.join("tasks")) + .unwrap(); + std::fs::rename(indexes_files, base_path.join("indexes")) + .unwrap(); + + // let inner = IndexSchedulerInner::new(); + + *lock = Some(todo!()); // run.env.close(); @@ -1126,6 +1005,168 @@ pub struct IndexSchedulerInner { } impl IndexSchedulerInner { + fn new( + options: IndexSchedulerOptions, + #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, + ) -> Result { + std::fs::create_dir_all(&options.tasks_path)?; + std::fs::create_dir_all(&options.update_file_path)?; + std::fs::create_dir_all(&options.indexes_path)?; + std::fs::create_dir_all(&options.dumps_path)?; + + if cfg!(windows) && options.enable_mdb_writemap { + // programmer error if this happens: in normal use passing the option on Windows is an error in main + panic!("Windows doesn't support the MDB_WRITEMAP LMDB option"); + } + + let task_db_size = clamp_to_page_size(options.task_db_size); + let budget = if options.indexer_config.skip_index_budget { + IndexBudget { + map_size: options.index_base_map_size, + index_count: options.index_count, + task_db_size, + } + } else { + Self::index_budget( + &options.tasks_path, + options.index_base_map_size, + task_db_size, + options.index_count, + ) + }; + + let env = heed::EnvOpenOptions::new() + .max_dbs(11) + .map_size(budget.task_db_size) + .open(options.tasks_path)?; + + let features = features::FeatureData::new(&env, options.instance_features)?; + + let file_store = FileStore::new(&options.update_file_path)?; + + let mut wtxn = env.write_txn()?; + let all_tasks = env.create_database(&mut wtxn, Some(db_name::ALL_TASKS))?; + let status = env.create_database(&mut wtxn, Some(db_name::STATUS))?; + let kind = env.create_database(&mut wtxn, Some(db_name::KIND))?; + let index_tasks = env.create_database(&mut wtxn, Some(db_name::INDEX_TASKS))?; + let canceled_by = env.create_database(&mut wtxn, Some(db_name::CANCELED_BY))?; + let enqueued_at = env.create_database(&mut wtxn, Some(db_name::ENQUEUED_AT))?; + let started_at = env.create_database(&mut wtxn, Some(db_name::STARTED_AT))?; + let finished_at = env.create_database(&mut wtxn, Some(db_name::FINISHED_AT))?; + wtxn.commit()?; + + // allow unreachable_code to get rids of the warning in the case of a test build. 
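// A worked example of the arithmetic performed by the `index_budget` helper that follows.
// The concrete inputs are illustrative (they are not taken from this patch), chosen to
// line up with the "80 TiB, 18 indexes" note on `DEFAULT_BUDGET`: a successful probe at
// 80 TiB, a 2 TiB per-index map size, a 10 GiB task DB, and a generous index cap. The
// single-index fallback and page-size clamping are omitted to keep the sketch short.
fn sketch_index_budget(
    probed: usize,
    base_map_size: usize,
    mut task_db_size: usize,
    max_index_count: usize,
) -> (usize, usize, usize) {
    // Keep half of the mappable space; the rest is headroom.
    let mut budget = probed / 2;
    // Shrink the task DB if it would eat more than half of that.
    if task_db_size > budget / 2 {
        task_db_size = budget * 2 / 5;
    }
    budget -= task_db_size;
    // One map of `base_map_size` per index, minus one spare slot, capped by the option.
    let index_count = (budget / base_map_size).saturating_sub(1).min(max_index_count);
    (base_map_size, index_count, task_db_size)
}

fn main() {
    const GIB: usize = 1024 * 1024 * 1024;
    const TIB: usize = 1024 * GIB;
    let (map_size, index_count, task_db_size) =
        sketch_index_budget(80 * TIB, 2 * TIB, 10 * GIB, 200);
    // 40 TiB of index budget minus the task DB leaves room for 19 maps of 2 TiB;
    // one slot is kept free, so up to 18 indexes can be open at once.
    assert_eq!((map_size, index_count, task_db_size), (2 * TIB, 18, 10 * GIB));
}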
+ Ok(IndexSchedulerInner { + must_stop_processing: MustStopProcessing::default(), + processing_tasks: Arc::new(RwLock::new(ProcessingTasks::new())), + file_store, + all_tasks, + status, + kind, + index_tasks, + canceled_by, + enqueued_at, + started_at, + finished_at, + index_mapper: IndexMapper::new( + &env, + options.indexes_path, + budget.map_size, + options.index_growth_amount, + budget.index_count, + options.enable_mdb_writemap, + options.indexer_config, + )?, + env, + // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things + wake_up: Arc::new(SignalEvent::auto(true)), + autobatching_enabled: options.autobatching_enabled, + max_number_of_tasks: options.max_number_of_tasks, + dumps_path: options.dumps_path, + snapshots_path: options.snapshots_path, + auth_path: options.auth_path, + version_file_path: options.version_file_path, + zookeeper: options.zookeeper, + #[cfg(test)] + test_breakpoint_sdr, + #[cfg(test)] + planned_failures, + #[cfg(test)] + run_loop_iteration: Arc::new(RwLock::new(0)), + features, + }) + } + + fn index_budget( + tasks_path: &Path, + base_map_size: usize, + mut task_db_size: usize, + max_index_count: usize, + ) -> IndexBudget { + #[cfg(windows)] + const DEFAULT_BUDGET: usize = 6 * 1024 * 1024 * 1024 * 1024; // 6 TiB, 1 index + #[cfg(not(windows))] + const DEFAULT_BUDGET: usize = 80 * 1024 * 1024 * 1024 * 1024; // 80 TiB, 18 indexes + + let budget = if Self::is_good_heed(tasks_path, DEFAULT_BUDGET) { + DEFAULT_BUDGET + } else { + log::debug!("determining budget with dichotomic search"); + utils::dichotomic_search(DEFAULT_BUDGET / 2, |map_size| { + Self::is_good_heed(tasks_path, map_size) + }) + }; + + log::debug!("memmap budget: {budget}B"); + let mut budget = budget / 2; + if task_db_size > (budget / 2) { + task_db_size = clamp_to_page_size(budget * 2 / 5); + log::debug!( + "Decreasing max size of task DB to {task_db_size}B due to constrained memory space" + ); + } + budget -= task_db_size; + + // won't be mutated again + let budget = budget; + let task_db_size = task_db_size; + + log::debug!("index budget: {budget}B"); + let mut index_count = budget / base_map_size; + if index_count < 2 { + // take a bit less than half than the budget to make sure we can always afford to open an index + let map_size = (budget * 2) / 5; + // single index of max budget + log::debug!("1 index of {map_size}B can be opened simultaneously."); + return IndexBudget { map_size, index_count: 1, task_db_size }; + } + // give us some space for an additional index when the cache is already full + // decrement is OK because index_count >= 2. + index_count -= 1; + if index_count > max_index_count { + index_count = max_index_count; + } + log::debug!("Up to {index_count} indexes of {base_map_size}B opened simultaneously."); + IndexBudget { map_size: base_map_size, index_count, task_db_size } + } + + fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { + if let Ok(env) = + heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) + { + env.prepare_for_closing().wait(); + true + } else { + // We're treating all errors equally here, not only allocation errors. + // This means there's a possiblity for the budget to lower due to errors different from allocation errors. + // For persistent errors, this is OK as long as the task db is then reopened normally without ignoring the error this time. + // For transient errors, this could lead to an instance with too low a budget. 
+            // However transient errors are: 1) less likely than persistent errors 2) likely to cause other issues down the line anyway.
+            false
+        }
+    }
+
     pub fn read_txn(&self) -> Result<RoTxn> {
         self.env.read_txn().map_err(|e| e.into())
     }
diff --git a/index-scheduler/src/lru.rs b/index-scheduler/src/lru.rs
index 370ff5fe1..0ea321d4b 100644
--- a/index-scheduler/src/lru.rs
+++ b/index-scheduler/src/lru.rs
@@ -1,5 +1,6 @@
 //! Thread-safe `Vec`-backend LRU cache using [`std::sync::atomic::AtomicU64`] for synchronization.
 
+use std::mem;
 use std::sync::atomic::{AtomicU64, Ordering};
 
 /// Thread-safe `Vec`-backend LRU cache
@@ -190,6 +191,11 @@ where
         }
         None
     }
+
+    /// Drains the `LruMap`, returning the generations associated with its keys and values.
+    pub fn clear(&mut self) -> Vec<(AtomicU64, (K, V))> {
+        mem::take(&mut self.0.data)
+    }
 }
 
 /// The result of an insertion in a LRU map.
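// A short usage sketch for the new `LruMap::clear` above. `mem::take` swaps the backing
// `Vec` with an empty one, so the map stays usable after being drained, and the caller
// gets every (generation, (key, value)) entry back to dispose of as it sees fit (in this
// patch, closing the indexes). The type below is a simplified, std-only stand-in for the
// real `LruMap`, so the behaviour can be checked in isolation:
use std::mem;
use std::sync::atomic::AtomicU64;

struct TinyLru<K, V> {
    data: Vec<(AtomicU64, (K, V))>,
}

impl<K, V> TinyLru<K, V> {
    fn clear(&mut self) -> Vec<(AtomicU64, (K, V))> {
        // Leaves `self.data` as an empty Vec, exactly like the patched `LruMap::clear`.
        mem::take(&mut self.data)
    }
}

fn main() {
    let mut lru = TinyLru { data: vec![(AtomicU64::new(1), ("movies".to_string(), 42u32))] };
    let drained = lru.clear();
    assert_eq!(drained.len(), 1);
    assert!(lru.data.is_empty()); // the map can keep being used after the drain
}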