diff --git a/meilitool/src/upgrade/mod.rs b/meilitool/src/upgrade/mod.rs deleted file mode 100644 index ae095b6bd..000000000 --- a/meilitool/src/upgrade/mod.rs +++ /dev/null @@ -1,73 +0,0 @@ -mod v1_10; -mod v1_11; -mod v1_9; - -use std::path::{Path, PathBuf}; - -use anyhow::{bail, Context}; -use meilisearch_types::versioning::create_version_file; - -use v1_10::v1_9_to_v1_10; - -use crate::upgrade::v1_11::v1_10_to_v1_11; - -pub struct OfflineUpgrade { - pub db_path: PathBuf, - pub current_version: (String, String, String), - pub target_version: (String, String, String), -} - -impl OfflineUpgrade { - pub fn upgrade(self) -> anyhow::Result<()> { - let upgrade_list = [ - (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), - (v1_10_to_v1_11, "1", "11", "0"), - ]; - - let (current_major, current_minor, current_patch) = &self.current_version; - - let start_at = match ( - current_major.as_str(), - current_minor.as_str(), - current_patch.as_str(), - ) { - ("1", "9", _) => 0, - ("1", "10", _) => 1, - _ => { - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9") - } - }; - - let (target_major, target_minor, target_patch) = &self.target_version; - - let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { - ("1", "10", _) => 0, - ("1", "11", _) => 1, - (major, _, _) if major.starts_with('v') => { - bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") - } - _ => { - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.11") - } - }; - - println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); - - #[allow(clippy::needless_range_loop)] - for index in start_at..=ends_at { - let (func, major, minor, patch) = upgrade_list[index]; - (func)(&self.db_path)?; - println!("Done"); - // We're writing the version file just in case an issue arise _while_ upgrading. - // We don't want the DB to fail in an unknown state. - println!("Writing VERSION file"); - - create_version_file(&self.db_path, major, minor, patch) - .context("while writing VERSION file after the upgrade")?; - } - - println!("Success"); - - Ok(()) - } -} diff --git a/meilitool/src/upgrade/v1_10.rs b/meilitool/src/upgrade/v1_10.rs deleted file mode 100644 index 671f4d6d2..000000000 --- a/meilitool/src/upgrade/v1_10.rs +++ /dev/null @@ -1,289 +0,0 @@ -use anyhow::bail; -use std::path::Path; - -use anyhow::Context; -use meilisearch_types::{ - heed::{ - types::{SerdeJson, Str}, - Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, - }, - milli::index::{db_name, main_key}, -}; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; - -use super::v1_9; - -pub type FieldDistribution = std::collections::BTreeMap; - -/// The statistics that can be computed from an `Index` object. -#[derive(serde::Serialize, serde::Deserialize, Debug)] -pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, - /// Size taken up by the index' DB, in bytes. - /// - /// This includes the size taken by both the used and free pages of the DB, and as the free pages - /// are not returned to the disk after a deletion, this number is typically larger than - /// `used_database_size` that only includes the size of the used pages. - pub database_size: u64, - /// Size taken by the used pages of the index' DB, in bytes. - /// - /// As the DB backend does not return to the disk the pages that are not currently used by the DB, - /// this value is typically smaller than `database_size`. - pub used_database_size: u64, - /// Association of every field name with the number of times it occurs in the documents. - pub field_distribution: FieldDistribution, - /// Creation date of the index. - #[serde(with = "time::serde::rfc3339")] - pub created_at: time::OffsetDateTime, - /// Date of the last update of the index. - #[serde(with = "time::serde::rfc3339")] - pub updated_at: time::OffsetDateTime, -} - -impl From for IndexStats { - fn from( - v1_9::IndexStats { - number_of_documents, - database_size, - used_database_size, - field_distribution, - created_at, - updated_at, - }: v1_9::IndexStats, - ) -> Self { - IndexStats { - number_of_documents, - database_size, - used_database_size, - field_distribution, - created_at: created_at.0, - updated_at: updated_at.0, - } - } -} - -#[derive(serde::Serialize, serde::Deserialize)] -#[serde(transparent)] -pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime); - -fn update_index_stats( - index_stats: Database, - index_uid: &str, - index_uuid: uuid::Uuid, - sched_wtxn: &mut RwTxn, -) -> anyhow::Result<()> { - let ctx = || format!("while updating index stats for index `{index_uid}`"); - - let stats: Option<&str> = index_stats - .remap_data_type::() - .get(sched_wtxn, &index_uuid) - .with_context(ctx) - .with_context(|| "While reading value")?; - dbg!(stats); - - let stats: Option = index_stats - .remap_data_type::>() - .get(sched_wtxn, &index_uuid) - .with_context(ctx) - .with_context(|| "While reading value")?; - - if let Some(stats) = stats { - let stats: self::IndexStats = stats.into(); - - index_stats - .remap_data_type::>() - .put(sched_wtxn, &index_uuid, &stats) - .with_context(ctx) - .with_context(|| "While writing value")?; - } - - Ok(()) -} - -fn update_date_format( - index_uid: &str, - index_env: &Env, - index_wtxn: &mut RwTxn, -) -> anyhow::Result<()> { - let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN) - .with_context(|| format!("while updating date format for index `{index_uid}`"))?; - - date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?; - date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?; - - Ok(()) -} - -fn find_rest_embedders( - index_uid: &str, - index_env: &Env, - index_txn: &RoTxn, -) -> anyhow::Result> { - let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN) - .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?; - - let mut rest_embedders = vec![]; - - for config in main - .remap_types::>>() - .get(index_txn, main_key::EMBEDDING_CONFIGS)? - .unwrap_or_default() - { - if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options { - rest_embedders.push(config.name); - } - } - - Ok(rest_embedders) -} - -fn date_round_trip( - wtxn: &mut RwTxn, - index_uid: &str, - db: Database, - key: &str, -) -> anyhow::Result<()> { - let datetime = - db.remap_types::>().get(wtxn, key).with_context(|| { - format!("could not read `{key}` while updating date format for index `{index_uid}`") - })?; - - if let Some(datetime) = datetime { - db.remap_types::>() - .put(wtxn, key, &self::OffsetDateTime(datetime.0)) - .with_context(|| { - format!( - "could not write `{key}` while updating date format for index `{index_uid}`" - ) - })?; - } - - Ok(()) -} - -pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { - println!("Upgrading from v1.9.0 to v1.10.0"); - // 2 changes here - - // 1. date format. needs to be done before opening the Index - // 2. REST embedders. We don't support this case right now, so bail - - let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; - - let mut sched_wtxn = env.write_txn()?; - - let index_mapping: Database = - try_opening_database(&env, &sched_wtxn, "index-mapping")?; - - let index_stats: Database = - try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| { - format!("While trying to open {:?}", index_scheduler_path.display()) - })?; - - let index_count = - index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?; - - // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn - // 1. immutably for the iteration - // 2. mutably for updating index stats - let indexes: Vec<_> = index_mapping - .iter(&sched_wtxn)? - .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) - .collect(); - - let mut rest_embedders = Vec::new(); - - let mut unwrapped_indexes = Vec::new(); - - // check that update can take place - for (index_index, result) in indexes.into_iter().enumerate() { - let (uid, uuid) = result?; - let index_path = db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? - }; - - let index_txn = index_env.read_txn().with_context(|| { - format!( - "while obtaining a write transaction for index {uid} at {}", - index_path.display() - ) - })?; - - println!("\t- Checking for incompatible embedders (REST embedders)"); - let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?; - - if rest_embedders_for_index.is_empty() { - unwrapped_indexes.push((uid, uuid)); - } else { - // no need to add to unwrapped indexes because we'll exit early - rest_embedders.push((uid, rest_embedders_for_index)); - } - } - - if !rest_embedders.is_empty() { - let rest_embedders = rest_embedders - .into_iter() - .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders)) - .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`")) - .collect::>() - .join("\n"); - bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\ - The database has not been modified and is still a valid v1.9 database."); - } - - println!("Update can take place, updating"); - - for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() { - let index_path = db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Updating index `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? - }; - - let mut index_wtxn = index_env.write_txn().with_context(|| { - format!( - "while obtaining a write transaction for index `{uid}` at `{}`", - index_path.display() - ) - })?; - - println!("\t- Updating index stats"); - update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?; - println!("\t- Updating date format"); - update_date_format(&uid, &index_env, &mut index_wtxn)?; - - index_wtxn.commit().with_context(|| { - format!("while committing the write txn for index `{uid}` at {}", index_path.display()) - })?; - } - - sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?; - - println!("Upgrading database succeeded"); - - Ok(()) -} diff --git a/meilitool/src/upgrade/v1_11.rs b/meilitool/src/upgrade/v1_11.rs deleted file mode 100644 index 4105879fd..000000000 --- a/meilitool/src/upgrade/v1_11.rs +++ /dev/null @@ -1,86 +0,0 @@ -//! The breaking changes that happened between the v1.10 and the v1.11 are: -//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0 -//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata. -//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB. -//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything. - -use std::path::Path; - -use anyhow::Context; -use meilisearch_types::{ - heed::{types::Str, Database, EnvOpenOptions}, - milli::index::db_name, -}; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; - -pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { - println!("Upgrading from v1.10.0 to v1.11.0"); - - let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; - - let sched_rtxn = env.read_txn()?; - - let index_mapping: Database = - try_opening_database(&env, &sched_rtxn, "index-mapping")?; - - let index_count = - index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?; - - let indexes: Vec<_> = index_mapping - .iter(&sched_rtxn)? - .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) - .collect(); - - // check that update can take place - for (index_index, result) in indexes.into_iter().enumerate() { - let (uid, uuid) = result?; - let index_path = db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? - }; - - let index_rtxn = index_env.read_txn().with_context(|| { - format!( - "while obtaining a read transaction for index {uid} at {}", - index_path.display() - ) - })?; - let index_read_database = - try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) - .with_context(|| format!("while updating date format for index `{uid}`"))?; - - let mut index_wtxn = index_env.write_txn().with_context(|| { - format!( - "while obtaining a write transaction for index {uid} at {}", - index_path.display() - ) - })?; - - let index_write_database = - try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY) - .with_context(|| format!("while updating date format for index `{uid}`"))?; - - arroy_v04_to_v05::ugrade_from_prev_version( - &index_rtxn, - index_read_database, - &mut index_wtxn, - index_write_database, - )?; - - index_wtxn.commit()?; - } - - Ok(()) -} diff --git a/meilitool/src/upgrade/v1_9.rs b/meilitool/src/upgrade/v1_9.rs deleted file mode 100644 index 3e6cfde6c..000000000 --- a/meilitool/src/upgrade/v1_9.rs +++ /dev/null @@ -1,106 +0,0 @@ -use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; - -pub type FieldDistribution = std::collections::BTreeMap; - -/// The statistics that can be computed from an `Index` object. -#[derive(serde::Serialize, serde::Deserialize, Debug)] -pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, - /// Size taken up by the index' DB, in bytes. - /// - /// This includes the size taken by both the used and free pages of the DB, and as the free pages - /// are not returned to the disk after a deletion, this number is typically larger than - /// `used_database_size` that only includes the size of the used pages. - pub database_size: u64, - /// Size taken by the used pages of the index' DB, in bytes. - /// - /// As the DB backend does not return to the disk the pages that are not currently used by the DB, - /// this value is typically smaller than `database_size`. - pub used_database_size: u64, - /// Association of every field name with the number of times it occurs in the documents. - pub field_distribution: FieldDistribution, - /// Creation date of the index. - pub created_at: LegacyTime, - /// Date of the last update of the index. - pub updated_at: LegacyTime, -} - -#[derive(Debug, Deserialize, Serialize)] -pub struct IndexEmbeddingConfig { - pub name: String, - pub config: EmbeddingConfig, -} - -#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] -pub struct EmbeddingConfig { - /// Options of the embedder, specific to each kind of embedder - pub embedder_options: EmbedderOptions, -} - -/// Options of an embedder, specific to each kind of embedder. -#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] -pub enum EmbedderOptions { - HuggingFace(hf::EmbedderOptions), - OpenAi(openai::EmbedderOptions), - Ollama(ollama::EmbedderOptions), - UserProvided(manual::EmbedderOptions), - Rest(rest::EmbedderOptions), -} - -impl Default for EmbedderOptions { - fn default() -> Self { - Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None }) - } -} - -mod hf { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub model: String, - pub revision: Option, - } -} -mod openai { - - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub api_key: Option, - pub dimensions: Option, - } -} -mod ollama { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub embedding_model: String, - pub url: Option, - pub api_key: Option, - } -} -mod manual { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub dimensions: usize, - } -} -mod rest { - #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] - pub struct EmbedderOptions { - pub api_key: Option, - pub dimensions: Option, - pub url: String, - pub input_field: Vec, - // path to the array of embeddings - pub path_to_embeddings: Vec, - // shape of a single embedding - pub embedding_object: Vec, - } -} - -// 2024-11-04 13:32:08.48368 +00:00:00 -time::serde::format_description!(legacy_datetime, OffsetDateTime, "[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]"); - -#[derive(Debug, serde::Serialize, serde::Deserialize)] -#[serde(transparent)] -pub struct LegacyTime(#[serde(with = "legacy_datetime")] pub OffsetDateTime);