Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-02-13 06:10:12 +08:00
Merge #5336

5336: Meilitool Hair Dryer r=dureuill a=Kerollmops

This pull request introduces a new subcommand to hair dry a specific part of specific indexes. It is useful when [the memory-mapped pages are not hot in the cache](https://arc.net/l/quote/ixhcdwcq) and must be. Hair drying those interesting pages makes search requests that use the vector store much faster.

The previous technique was the "cat method": read the whole LMDB data file and pipe it into the null file descriptor, which makes the entire data file hot in the cache (a rough Rust equivalent is sketched below). However, when the database is large, at least 30% of it consists of free, unused pages, and many other pages don't need to be hot either, e.g., raw JSON documents or uninteresting parts of the inverted index. This new subcommand reads all the Arroy pages of a given index, and only those, to make them hot.

More coming... The current algorithm is single-threaded and takes a lot of time. I am in the process of multithreading it. This is the time it takes to hair dry a 305GiB database with a single thread:

```
real    21m51.054s
user    0m3.155s
sys     0m19.393s
```

## To Do

- [ ] (optional) Do the reads in parallel.

Co-authored-by: Kerollmops <clement@meilisearch.com>
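For context, here is a minimal sketch of that cat-style warm-up in plain Rust: it streams an LMDB data file from start to finish so that the kernel pulls every page into the page cache, which is what piping the file into `/dev/null` achieves. The function name, the caller-supplied path, and the 1 MiB chunk size are illustrative, not part of meilitool.

```rust
use std::fs::File;
use std::io::Read;
use std::path::Path;

/// Reads the given LMDB data file from start to finish so the kernel loads
/// every page into the page cache, exactly like `cat data.mdb > /dev/null`.
/// The downside, as explained above, is that free pages and uninteresting
/// parts of the index get heated as well.
fn warm_whole_data_file(data_mdb: &Path) -> std::io::Result<u64> {
    let mut file = File::open(data_mdb)?;
    let mut buffer = vec![0u8; 1 << 20]; // read in 1 MiB chunks
    let mut total = 0u64;
    loop {
        let read = file.read(&mut buffer)?;
        if read == 0 {
            break;
        }
        total += read as u64;
    }
    Ok(total)
}
```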
Commit 81a38099ec
```diff
@@ -4,12 +4,12 @@ use std::path::PathBuf;
 use std::time::Instant;
 
 use anyhow::{bail, Context};
-use clap::{Parser, Subcommand};
+use clap::{Parser, Subcommand, ValueEnum};
 use dump::{DumpWriter, IndexMetadata};
 use file_store::FileStore;
 use meilisearch_auth::AuthController;
 use meilisearch_types::batches::Batch;
-use meilisearch_types::heed::types::{SerdeJson, Str};
+use meilisearch_types::heed::types::{Bytes, SerdeJson, Str};
 use meilisearch_types::heed::{
     CompactionOption, Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
 };
```
```diff
@@ -124,6 +124,25 @@ enum Command {
     /// the compaction operation can start. Once the compaction is done, the big index is replaced
     /// by the compacted one and the mutable transaction is released.
     CompactIndex { index_name: String },
+
+    /// Uses the hair dryer to make the dedicated pages hot in the cache.
+    ///
+    /// To make the index faster we must make sure it is hot in the DB cache: that's the curse of
+    /// memory-mapping but also its strength. This command is designed to make a specific part of
+    /// the index hot in the cache.
+    HairDryer {
+        #[arg(long, value_delimiter = ',')]
+        index_name: Vec<String>,
+
+        #[arg(long, value_delimiter = ',')]
+        index_part: Vec<IndexPart>,
+    },
 }
+
+#[derive(Clone, ValueEnum)]
+enum IndexPart {
+    /// Will make the arroy index hot.
+    Arroy,
+}
 
 fn main() -> anyhow::Result<()> {
```
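Given clap's default kebab-case naming for subcommands and `ValueEnum` variants, the new command would presumably be invoked along these lines; the `products` and `movies` index names are placeholders, and both flags accept comma-separated lists because of `value_delimiter = ','`:

```
meilitool hair-dryer --index-name products,movies --index-part arroy
```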
```diff
@@ -144,6 +163,9 @@ fn main() -> anyhow::Result<()> {
             OfflineUpgrade { db_path, current_version: detected_version, target_version }.upgrade()
         }
         Command::CompactIndex { index_name } => compact_index(db_path, &index_name),
+        Command::HairDryer { index_name, index_part } => {
+            hair_dryer(db_path, &index_name, &index_part)
+        }
     }
 }
 
```
```diff
@@ -587,3 +609,67 @@ fn export_documents(
 
     Ok(())
 }
+
+fn hair_dryer(
+    db_path: PathBuf,
+    index_names: &[String],
+    index_parts: &[IndexPart],
+) -> anyhow::Result<()> {
+    let index_scheduler_path = db_path.join("tasks");
+    let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
+        .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
+
+    eprintln!("Trying to get a read transaction on the index scheduler...");
+
+    let rtxn = env.read_txn()?;
+    let index_mapping: Database<Str, UuidCodec> =
+        try_opening_database(&env, &rtxn, "index-mapping")?;
+
+    for result in index_mapping.iter(&rtxn)? {
+        let (uid, uuid) = result?;
+        if index_names.iter().any(|i| i == uid) {
+            let index_path = db_path.join("indexes").join(uuid.to_string());
+            let index =
+                Index::new(EnvOpenOptions::new(), &index_path, false).with_context(|| {
+                    format!("While trying to open the index at path {:?}", index_path.display())
+                })?;
+
+            eprintln!("Trying to get a read transaction on the {uid} index...");
+
+            let rtxn = index.read_txn()?;
+            for part in index_parts {
+                match part {
+                    IndexPart::Arroy => {
+                        let mut count = 0;
+                        let total = index.vector_arroy.len(&rtxn)?;
+                        eprintln!("Hair drying arroy for {uid}...");
+                        for (i, result) in index
+                            .vector_arroy
+                            .remap_types::<Bytes, Bytes>()
+                            .iter(&rtxn)?
+                            .enumerate()
+                        {
+                            let (key, value) = result?;
+
+                            // All of this just to avoid compiler optimizations 🤞
+                            // We must read all the bytes to make the pages hot in cache.
+                            // <https://doc.rust-lang.org/std/hint/fn.black_box.html>
+                            count += std::hint::black_box(key.iter().fold(0, |acc, _| acc + 1));
+                            count += std::hint::black_box(value.iter().fold(0, |acc, _| acc + 1));
+
+                            if i % 10_000 == 0 {
+                                let perc = (i as f64) / (total as f64) * 100.0;
+                                eprintln!("Visited {i}/{total} ({perc:.2}%) keys")
+                            }
+                        }
+                        eprintln!("Done hair drying a total of at least {count} bytes.");
+                    }
+                }
+            }
+        } else {
+            eprintln!("Found index {uid} but it's not the right index...");
+        }
+    }
+
+    Ok(())
+}
```
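The hunk above is the single-threaded version whose timings appear in the description. As one possible direction for the "(optional) do the reads in parallel" to-do item, each thread could open its own LMDB read transaction on the same index and only dereference the entries assigned to it. This is a hedged sketch, not part of the pull request: the helper name is hypothetical, it reuses the `Index` and `Bytes` types already imported in this file, and it assumes `Index` can be shared across scoped threads.

```rust
// Illustrative follow-up for the "do the reads in parallel" TODO.
// Each thread opens its own read transaction and touches every byte of the
// entries whose position matches its id, so the pages holding the large
// vector values are faulted in concurrently instead of one after the other.
fn hair_dry_arroy_in_parallel(index: &Index, threads: usize) -> anyhow::Result<()> {
    std::thread::scope(|scope| {
        let mut handles = Vec::new();
        for tid in 0..threads {
            handles.push(scope.spawn(move || -> anyhow::Result<usize> {
                let rtxn = index.read_txn()?;
                let mut count = 0;
                let iter = index.vector_arroy.remap_types::<Bytes, Bytes>().iter(&rtxn)?;
                for (i, result) in iter.enumerate() {
                    let (key, value) = result?;
                    if i % threads == tid {
                        // Same black_box trick as above: read every byte so the
                        // accesses are not optimized away and the pages become hot.
                        count += std::hint::black_box(key.iter().fold(0, |acc, _| acc + 1));
                        count += std::hint::black_box(value.iter().fold(0, |acc, _| acc + 1));
                    }
                }
                Ok(count)
            }));
        }

        let mut total = 0;
        for handle in handles {
            total += handle.join().expect("a hair drying thread panicked")?;
        }
        eprintln!("Hair dried at least {total} bytes using {threads} threads.");
        Ok(())
    })
}
```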