diff --git a/Cargo.lock b/Cargo.lock index 4886dc028..480dc782e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -503,7 +503,7 @@ source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2 [[package]] name = "benchmarks" -version = "1.13.0" +version = "1.13.1" dependencies = [ "anyhow", "bumpalo", @@ -694,7 +694,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.13.0" +version = "1.13.1" dependencies = [ "anyhow", "time", @@ -1671,7 +1671,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.13.0" +version = "1.13.1" dependencies = [ "anyhow", "big_s", @@ -1873,7 +1873,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-store" -version = "1.13.0" +version = "1.13.1" dependencies = [ "tempfile", "thiserror 2.0.9", @@ -1895,7 +1895,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.13.0" +version = "1.13.1" dependencies = [ "insta", "nom", @@ -1915,7 +1915,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.13.0" +version = "1.13.1" dependencies = [ "criterion", "serde_json", @@ -2054,7 +2054,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.13.0" +version = "1.13.1" dependencies = [ "arbitrary", "bumpalo", @@ -2743,7 +2743,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" [[package]] name = "index-scheduler" -version = "1.13.0" +version = "1.13.1" dependencies = [ "anyhow", "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2950,7 +2950,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.13.0" +version = "1.13.1" dependencies = [ "criterion", "serde_json", @@ -3569,7 +3569,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.13.0" +version = "1.13.1" dependencies = [ "insta", "md5", @@ -3578,7 +3578,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.13.0" +version = "1.13.1" dependencies = [ "actix-cors", "actix-http", @@ -3670,7 +3670,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.13.0" +version = "1.13.1" dependencies = [ "base64 0.22.1", "enum-iterator", @@ -3689,7 +3689,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.13.0" +version = "1.13.1" dependencies = [ "actix-web", "anyhow", @@ -3723,7 +3723,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.13.0" +version = "1.13.1" dependencies = [ "anyhow", "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)", @@ -3758,7 +3758,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.13.0" +version = "1.13.1" dependencies = [ "allocator-api2", "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -4270,7 +4270,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.13.0" +version = "1.13.1" dependencies = [ "big_s", "serde_json", @@ -6847,7 +6847,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.13.0" +version = "1.13.1" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index 8f7d87a87..ce1e119e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ ] [workspace.package] -version = "1.13.0" +version = "1.13.1" authors = [ "Quentin de Quelen ", "Clément Renault ", diff --git a/crates/index-scheduler/src/index_mapper/mod.rs 
b/crates/index-scheduler/src/index_mapper/mod.rs index 17d683bbb..48e29508f 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -6,6 +6,7 @@ use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; use meilisearch_types::milli; +use meilisearch_types::milli::database_stats::DatabaseStats; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -98,8 +99,9 @@ pub enum IndexStatus { /// The statistics that can be computed from an `Index` object. #[derive(Serialize, Deserialize, Debug)] pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, + /// Stats of the documents database. + #[serde(default)] + pub documents_database_stats: DatabaseStats, /// Size taken up by the index' DB, in bytes. /// /// This includes the size taken by both the used and free pages of the DB, and as the free pages @@ -138,9 +140,9 @@ impl IndexStats { pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { let arroy_stats = index.arroy_stats(rtxn)?; Ok(IndexStats { - number_of_documents: index.number_of_documents(rtxn)?, number_of_embeddings: Some(arroy_stats.number_of_embeddings), number_of_embedded_documents: Some(arroy_stats.documents.len()), + documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(), database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 6f1863876..bcc295afd 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -370,7 +370,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String { let stats = mapper.stats_of(rtxn, &name).unwrap(); s.push_str(&format!( "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n", - stats.number_of_documents, stats.field_distribution + stats.documents_database_stats.number_of_entries(), + stats.field_distribution )); } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap index 3ad2076c8..e7b50dfea 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone") }, kind: IndexCreation 
{ index_uid: "doggo", primary_key: Some("bone") }} 3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} @@ -58,7 +57,7 @@ girafo: { number_of_documents: 0, field_distribution: {} } [timestamp] [4,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.1"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } 1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, } 2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, } 3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap index eead6e773..1bd70062e 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap @@ -7,7 +7,7 @@ snapshot_kind: text [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap index 52f0b61a7..ece9ba67b 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap @@ -7,7 +7,7 @@ snapshot_kind: text [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 
---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap index 96efafc9e..6414ed9be 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} ---------------------------------------------------------------------- ### Status: @@ -38,7 +37,7 @@ catto [1,] [timestamp] [0,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.1"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### Batch to tasks mapping: 0 [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap index bd223298d..1da68c7c9 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { 
from: (1, 12, 0), to: (1, 13, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} ---------------------------------------------------------------------- @@ -41,7 +40,7 @@ doggo [2,] [timestamp] [0,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.1"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### Batch to tasks mapping: 0 [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap index 5bb2d57cf..fbb38c597 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} 3 {uid: 3, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} @@ -44,7 +43,7 @@ doggo [2,3,] [timestamp] [0,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.1"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### Batch to tasks mapping: 0 [0,] diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 44120ff64..84112de08 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -910,7 +910,11 @@ fn create_and_list_index() { [ "kefir", { - "number_of_documents": 0, + "documents_database_stats": { + "numberOfEntries": 0, + "totalKeySize": 0, + "totalValueSize": 0 + }, "database_size": "[bytes]", "number_of_embeddings": 0, "number_of_embedded_documents": 0, diff 
--git a/crates/index-scheduler/src/upgrade/mod.rs b/crates/index-scheduler/src/upgrade/mod.rs index 4e850aa32..cfc351b09 100644 --- a/crates/index-scheduler/src/upgrade/mod.rs +++ b/crates/index-scheduler/src/upgrade/mod.rs @@ -46,20 +46,19 @@ pub fn upgrade_index_scheduler( } }; - let mut current_version = from; - info!("Upgrading the task queue"); + let mut local_from = from; for upgrade in upgrade_functions[start..].iter() { let target = upgrade.target_version(); info!( "Upgrading from v{}.{}.{} to v{}.{}.{}", - from.0, from.1, from.2, current_version.0, current_version.1, current_version.2 + local_from.0, local_from.1, local_from.2, target.0, target.1, target.2 ); let mut wtxn = env.write_txn()?; - upgrade.upgrade(env, &mut wtxn, from)?; + upgrade.upgrade(env, &mut wtxn, local_from)?; versioning.set_version(&mut wtxn, target)?; wtxn.commit()?; - current_version = target; + local_from = target; } let mut wtxn = env.write_txn()?; diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 60af4dcba..a917ab00f 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -170,5 +170,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.16/build.zip" -sha1 = "68f83438a114aabbe76bc9fe480071e741996662" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.17/build.zip" +sha1 = "29e92ce25f306208a9c86f013279c736bdc1e034" diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 7ca8e407f..bbcb3674b 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -494,6 +494,10 @@ pub async fn delete_index( pub struct IndexStats { /// Number of documents in the index pub number_of_documents: u64, + /// Size of the documents database, in bytes. + pub raw_document_db_size: u64, + /// Average size of a document in the documents database. 
+ pub avg_document_size: u64, /// Whether or not the index is currently ingesting document pub is_indexing: bool, /// Number of embeddings in the index @@ -510,7 +514,9 @@ pub struct IndexStats { impl From for IndexStats { fn from(stats: index_scheduler::IndexStats) -> Self { IndexStats { - number_of_documents: stats.inner_stats.number_of_documents, + number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, @@ -532,6 +538,8 @@ impl From for IndexStats { (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( { "numberOfDocuments": 10, + "rawDocumentDbSize": 10, + "avgDocumentSize": 10, "numberOfEmbeddings": 10, "numberOfEmbeddedDocuments": 10, "isIndexing": true, diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 65a12b692..02cb4130a 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -392,6 +392,9 @@ pub struct Stats { "indexes": { "movies": { "numberOfDocuments": 10, + "rawDocumentDbSize": 100, + "maxDocumentSize": 16, + "avgDocumentSize": 10, "isIndexing": true, "fieldDistribution": { "genre": 10, diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 62cc51f29..4dfe2cc79 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -160,6 +160,8 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 4, + "rawDocumentDbSize": 42, + "avgDocumentSize": 10, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -209,6 +211,8 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 16, + "avgDocumentSize": 8, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -277,6 +281,8 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 12, + "avgDocumentSize": 12, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 55ee9dc93..d699be47a 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -32,6 +32,8 @@ async fn import_dump_v1_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -187,6 +189,8 @@ async fn import_dump_v1_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -355,6 +359,8 @@ async fn import_dump_v1_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -520,6 +526,8 @@ async fn import_dump_v2_movie_raw() { 
@r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -675,6 +683,8 @@ async fn import_dump_v2_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -840,6 +850,8 @@ async fn import_dump_v2_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1002,6 +1014,8 @@ async fn import_dump_v3_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1157,6 +1171,8 @@ async fn import_dump_v3_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1322,6 +1338,8 @@ async fn import_dump_v3_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1484,6 +1502,8 @@ async fn import_dump_v4_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1639,6 +1659,8 @@ async fn import_dump_v4_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1804,6 +1826,8 @@ async fn import_dump_v4_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1973,6 +1997,8 @@ async fn import_dump_v5() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 10, + "rawDocumentDbSize": 6782, + "avgDocumentSize": 678, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -2009,6 +2035,8 @@ async fn import_dump_v5() { @r###" { "numberOfDocuments": 10, + "rawDocumentDbSize": 6782, + "avgDocumentSize": 678, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index a7ec35270..a5fa94eea 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -128,6 +128,40 @@ async fn search_with_stop_word() { .await; } +#[actix_rt::test] +async fn search_with_typo_settings() { + // related to https://github.com/meilisearch/meilisearch/issues/5240 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index + .update_settings(json!({"typoTolerance": { "disableOnAttributes": ["title", "id"]}})) + .await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + + index + .search(json!({"q": "287947" }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "id": "287947", + "color": [ 
+ "green", + "blue" + ] + } + ] + "###); + }) + .await; +} + #[actix_rt::test] async fn phrase_search_with_stop_word() { // related to https://github.com/meilisearch/meilisearch/issues/3521 diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index bb10d2cd5..20a8eaef6 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -113,6 +113,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -136,6 +138,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, @@ -159,6 +163,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 2, @@ -183,6 +189,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 1, @@ -231,6 +239,8 @@ async fn add_remove_embedded_documents() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -250,6 +260,8 @@ async fn add_remove_embedded_documents() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 13, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 1, @@ -281,6 +293,8 @@ async fn update_embedder_settings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 108, + "avgDocumentSize": 54, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -315,6 +329,8 @@ async fn update_embedder_settings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 108, + "avgDocumentSize": 54, "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, diff --git a/crates/meilisearch/tests/upgrade/mod.rs b/crates/meilisearch/tests/upgrade/mod.rs index f26d99402..ca5cf0987 100644 --- a/crates/meilisearch/tests/upgrade/mod.rs +++ b/crates/meilisearch/tests/upgrade/mod.rs @@ -43,7 +43,7 @@ async fn version_too_old() { std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap(); let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); - snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.13.0"); + snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. 
Please generate a dump using the v1.11.9999 and import it in the v1.13.1"); } #[actix_rt::test] @@ -58,7 +58,7 @@ async fn version_requires_downgrade() { std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap(); let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); - snapshot!(err, @"Database version 1.13.1 is higher than the Meilisearch version 1.13.0. Downgrade is not supported"); + snapshot!(err, @"Database version 1.13.2 is higher than the Meilisearch version 1.13.1. Downgrade is not supported"); } #[actix_rt::test] diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index fcae53dba..9f41a3055 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index fcae53dba..9f41a3055 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index fcae53dba..9f41a3055 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap 
b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index 102e21b73..790118967 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index 102e21b73..790118967 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index 102e21b73..790118967 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index b5ff80f3c..55891e133 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap 
b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap index d965b9b68..665dc07fd 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap @@ -1,6 +1,5 @@ --- source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs -snapshot_kind: text --- { "results": [ @@ -13,7 +12,7 @@ snapshot_kind: text "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.13.0" + "upgradeTo": "v1.13.1" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index b7ea669a0..3e9f2b932 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -139,6 +139,8 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -225,6 +227,8 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -244,6 +248,8 @@ async fn check_the_index_scheduler(server: &Server) { snapshot!(stats, @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs new file mode 100644 index 000000000..d97dc13ba --- /dev/null +++ b/crates/milli/src/database_stats.rs @@ -0,0 +1,96 @@ +use heed::types::Bytes; +use heed::Database; +use heed::RoTxn; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// The stats of a database. +pub struct DatabaseStats { + /// The number of entries in the database. + number_of_entries: u64, + /// The total size of the keys in the database. + total_key_size: u64, + /// The total size of the values in the database. + total_value_size: u64, +} + +impl DatabaseStats { + /// Returns the stats of the database. + /// + /// This function iterates over the whole database and computes the stats. + /// It is not efficient and should be cached somewhere. + pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> heed::Result { + let mut database_stats = + Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; + + let mut iter = database.iter(rtxn)?; + while let Some((key, value)) = iter.next().transpose()? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + database_stats.total_key_size += key_size; + database_stats.total_value_size += value_size; + } + + database_stats.number_of_entries = database.len(rtxn)?; + + Ok(database_stats) + } + + /// Recomputes the stats of the database and returns the new stats. + /// + /// This function is used to update the stats of the database when some keys are modified. 
+ /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. + pub(crate) fn recompute( + mut stats: Self, + database: Database, + before_rtxn: &RoTxn<'_>, + after_rtxn: &RoTxn<'_>, + modified_keys: I, + ) -> heed::Result + where + I: IntoIterator, + K: AsRef<[u8]>, + { + for key in modified_keys { + let key = key.as_ref(); + if let Some(value) = database.get(after_rtxn, key)? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + stats.total_key_size = stats.total_key_size.saturating_add(key_size); + stats.total_value_size = stats.total_value_size.saturating_add(value_size); + } + + if let Some(value) = database.get(before_rtxn, key)? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + stats.total_key_size = stats.total_key_size.saturating_sub(key_size); + stats.total_value_size = stats.total_value_size.saturating_sub(value_size); + } + } + + stats.number_of_entries = database.len(after_rtxn)?; + + Ok(stats) + } + + pub fn average_key_size(&self) -> u64 { + self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0) + } + + pub fn average_value_size(&self) -> u64 { + self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0) + } + + pub fn number_of_entries(&self) -> u64 { + self.number_of_entries + } + + pub fn total_key_size(&self) -> u64 { + self.total_key_size + } + + pub fn total_value_size(&self) -> u64 { + self.total_value_size + } +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index df1baed3c..c748324ae 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -11,6 +11,7 @@ use rstar::RTree; use serde::{Deserialize, Serialize}; use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -74,6 +75,7 @@ pub mod main_key { pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; pub const FACET_SEARCH: &str = "facet_search"; pub const PREFIX_SEARCH: &str = "prefix_search"; + pub const DOCUMENTS_STATS: &str = "documents_stats"; } pub mod db_name { @@ -403,6 +405,58 @@ impl Index { Ok(count.unwrap_or_default()) } + /// Updates the stats of the documents database based on the previous stats and the modified docids. + pub fn update_documents_stats( + &self, + wtxn: &mut RwTxn<'_>, + modified_docids: roaring::RoaringBitmap, + ) -> Result<()> { + let before_rtxn = self.read_txn()?; + let document_stats = match self.documents_stats(&before_rtxn)? { + Some(before_stats) => DatabaseStats::recompute( + before_stats, + self.documents.remap_types(), + &before_rtxn, + wtxn, + modified_docids.iter().map(|docid| docid.to_be_bytes()), + )?, + None => { + // This should never happen when there are already documents in the index, the documents stats should be present. + // If it happens, it means that the index was not properly initialized/upgraded. + debug_assert_eq!( + self.documents.len(&before_rtxn)?, + 0, + "The documents stats should be present when there are documents in the index" + ); + tracing::warn!("No documents stats found, creating new ones"); + DatabaseStats::new(self.documents.remap_types(), &*wtxn)? + } + }; + + self.put_documents_stats(wtxn, document_stats)?; + Ok(()) + } + + /// Writes the stats of the documents database. 
+ pub fn put_documents_stats( + &self, + wtxn: &mut RwTxn<'_>, + stats: DatabaseStats, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + wtxn, + main_key::DOCUMENTS_STATS, + &stats, + ) + } + + /// Returns the stats of the documents database. + pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + self.main + .remap_types::>() + .get(rtxn, main_key::DOCUMENTS_STATS) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index bb1532c1a..1d6d04fc7 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -10,6 +10,7 @@ pub mod documents; mod asc_desc; mod criterion; +pub mod database_stats; mod error; mod external_documents_ids; pub mod facet; diff --git a/crates/milli/src/search/new/query_term/compute_derivations.rs b/crates/milli/src/search/new/query_term/compute_derivations.rs index e2a136328..79cd830ca 100644 --- a/crates/milli/src/search/new/query_term/compute_derivations.rs +++ b/crates/milli/src/search/new/query_term/compute_derivations.rs @@ -215,7 +215,7 @@ pub fn partially_initialized_term_from_word( let mut zero_typo = None; let mut prefix_of = BTreeSet::new(); - if fst.contains(word) { + if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() { zero_typo = Some(word_interned); } diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 56c26ed29..de3b1ee59 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -307,6 +307,7 @@ where let current_span = tracing::Span::current(); // Run extraction pipeline in parallel. + let mut modified_docids = RoaringBitmap::new(); pool.install(|| { let settings_diff_cloned = settings_diff.clone(); rayon::spawn(move || { @@ -367,7 +368,7 @@ where Err(status) => { if let Some(typed_chunks) = chunk_accumulator.pop_longest() { let (docids, is_merged_database) = - write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?; + write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); @@ -467,6 +468,10 @@ where Ok(()) }).map_err(InternalError::from)??; + if !settings_diff.settings_update_only { + // Update the stats of the documents database when there is a document update. 
+ self.index.update_documents_stats(self.wtxn, modified_docids)?; + } // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index d5c250e2d..0809d9601 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -129,6 +129,7 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, settings_diff: &InnerIndexSettingsDiff, typed_chunks: Vec, + modified_docids: &mut RoaringBitmap, ) -> Result<(RoaringBitmap, bool)> { let mut is_merged_database = false; match typed_chunks[0] { @@ -214,6 +215,7 @@ pub(crate) fn write_typed_chunk_into_index( kind: DocumentOperationKind::Create, }); docids.insert(docid); + modified_docids.insert(docid); } else { db.delete(wtxn, &docid)?; operations.push(DocumentOperation { @@ -222,6 +224,7 @@ pub(crate) fn write_typed_chunk_into_index( kind: DocumentOperationKind::Delete, }); docids.remove(docid); + modified_docids.insert(docid); } } let external_documents_docids = index.external_documents_ids(); diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 47bca6193..f9829032b 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -711,15 +711,17 @@ impl DelAddRoaringBitmap { DelAddRoaringBitmap { del, add } } - pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) { + pub fn apply_to(&self, documents_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap) { let DelAddRoaringBitmap { del, add } = self; if let Some(del) = del { *documents_ids -= del; + *modified_docids |= del; } if let Some(add) = add { *documents_ids |= add; + *modified_docids |= add; } } } diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 792b0c03b..f49cd834d 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -32,6 +32,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( field_distribution: &mut BTreeMap, mut index_embeddings: Vec, document_ids: &mut RoaringBitmap, + modified_docids: &mut RoaringBitmap, ) -> Result<(FacetFieldIdsDelta, Vec)> where DC: DocumentChanges<'pl>, @@ -70,7 +71,7 @@ where // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
*current = current.saturating_add_signed(delta); } - document_extractor_data.docids_delta.apply_to(document_ids); + document_extractor_data.docids_delta.apply_to(document_ids, modified_docids); } field_distribution.retain(|_, v| *v != 0); @@ -256,7 +257,7 @@ where let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); + deladd.apply_to(&mut config.user_provided, modified_docids); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 58c60d502..1cd227139 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -130,6 +130,7 @@ where let index_embeddings = index.embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; + let mut modified_docids = roaring::RoaringBitmap::new(); let congestion = thread::scope(|s| -> Result { let indexer_span = tracing::Span::current(); @@ -138,6 +139,7 @@ where // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; + let modified_docids = &mut modified_docids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.install(move || { @@ -152,6 +154,7 @@ where field_distribution, index_embeddings, document_ids, + modified_docids, ) }) .unwrap() @@ -227,6 +230,7 @@ where embedders, field_distribution, document_ids, + modified_docids, )?; Ok(congestion) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 1dad993f0..723e018a1 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -121,7 +121,8 @@ where Ok(()) } -pub fn update_index( +#[allow(clippy::too_many_arguments)] +pub(super) fn update_index( index: &Index, wtxn: &mut RwTxn<'_>, new_fields_ids_map: FieldIdMapWithMetadata, @@ -129,6 +130,7 @@ pub fn update_index( embedders: EmbeddingConfigs, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, + modified_docids: roaring::RoaringBitmap, ) -> Result<()> { index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { @@ -140,6 +142,7 @@ pub fn update_index( index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + index.update_documents_stats(wtxn, modified_docids)?; Ok(()) } diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index 16f0eef7a..0ed67f2cb 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -3,7 +3,7 @@ mod v1_13; use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; -use v1_13::V1_13_0_To_Current; +use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Current}; use crate::progress::{Progress, VariableNameStep}; use crate::{Index, InternalError, Result}; @@ -28,13 +28,18 @@ pub fn upgrade( progress: Progress, ) -> Result { let from = index.get_version(wtxn)?.unwrap_or(db_version); - let upgrade_functions: &[&dyn UpgradeIndex] = - &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()]; + let upgrade_functions: &[&dyn UpgradeIndex] = &[ + &V1_12_To_V1_12_3 {}, + &V1_12_3_To_V1_13_0 {}, + &V1_13_0_To_V1_13_1 {}, + &V1_13_1_To_Current 
{}, + ]; let start = match from { (1, 12, 0..=2) => 0, (1, 12, 3..) => 1, (1, 13, 0) => 2, + (1, 13, 1) => 3, // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 13, _) => return Ok(false), (major, minor, patch) => { diff --git a/crates/milli/src/update/upgrade/v1_13.rs b/crates/milli/src/update/upgrade/v1_13.rs index 52246a7f3..f1d56d9cb 100644 --- a/crates/milli/src/update/upgrade/v1_13.rs +++ b/crates/milli/src/update/upgrade/v1_13.rs @@ -2,13 +2,44 @@ use heed::RwTxn; use super::UpgradeIndex; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use crate::database_stats::DatabaseStats; use crate::progress::Progress; -use crate::{Index, Result}; +use crate::{make_enum_progress, Index, Result}; #[allow(non_camel_case_types)] -pub(super) struct V1_13_0_To_Current(); +pub(super) struct V1_13_0_To_V1_13_1(); -impl UpgradeIndex for V1_13_0_To_Current { +impl UpgradeIndex for V1_13_0_To_V1_13_1 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + progress: Progress, + ) -> Result { + make_enum_progress! { + enum DocumentsStats { + CreatingDocumentsStats, + } + }; + + // Create the new documents stats. + progress.update_progress(DocumentsStats::CreatingDocumentsStats); + let stats = DatabaseStats::new(index.documents.remap_types(), wtxn)?; + index.put_documents_stats(wtxn, stats)?; + + Ok(true) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 13, 1) + } +} + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_1_To_Current(); + +impl UpgradeIndex for V1_13_1_To_Current { fn upgrade( &self, _wtxn: &mut RwTxn,
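
Taken together, the changes above replace the scheduler's plain `number_of_documents` counter with a persisted `DatabaseStats` record for the documents database, and surface it through the stats route as `numberOfDocuments`, `rawDocumentDbSize` (total value size) and `avgDocumentSize` (average value size). The following is a minimal, self-contained sketch of that bookkeeping, using an in-memory `BTreeMap` as a stand-in for the LMDB documents database; the type name is kept, but `from_scan` and the map-based signature are illustrative, not the `heed`-based API from the diff.

```rust
use std::collections::BTreeMap;

/// Stand-in for the stats persisted under `main_key::DOCUMENTS_STATS` in the diff.
#[derive(Debug, Default, Clone, Copy)]
struct DatabaseStats {
    number_of_entries: u64,
    total_key_size: u64,
    total_value_size: u64,
}

impl DatabaseStats {
    /// Full scan, analogous to `DatabaseStats::new`: iterate every entry and sum sizes.
    fn from_scan(db: &BTreeMap<Vec<u8>, Vec<u8>>) -> Self {
        let mut stats = Self::default();
        for (key, value) in db {
            stats.total_key_size += key.len() as u64;
            stats.total_value_size += value.len() as u64;
        }
        stats.number_of_entries = db.len() as u64;
        stats
    }

    /// Backs `rawDocumentDbSize` in the HTTP stats payload.
    fn total_value_size(&self) -> u64 {
        self.total_value_size
    }

    /// Backs `avgDocumentSize`: integer division, 0 when the index is empty.
    fn average_value_size(&self) -> u64 {
        self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
    }
}

fn main() {
    let mut db = BTreeMap::new();
    db.insert(b"1".to_vec(), br#"{"id":1,"title":"Shazam!"}"#.to_vec());
    db.insert(b"2".to_vec(), br#"{"id":2,"title":"Kefir"}"#.to_vec());

    let stats = DatabaseStats::from_scan(&db);
    println!(
        "numberOfDocuments: {}, rawDocumentDbSize: {}, avgDocumentSize: {}",
        stats.number_of_entries,
        stats.total_value_size(),
        stats.average_value_size()
    );
}
```

Note that `rawDocumentDbSize` only counts value bytes; key bytes are tracked separately as `total_key_size` and are not exposed by the route.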
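Because a full scan of the documents database is costly (`DatabaseStats::new` is documented as such), indexing updates instead call `DatabaseStats::recompute` with only the modified document ids, comparing each key's size before and after the write. Below is a sketch of that delta logic, again over in-memory maps standing in for the before/after transactions, with an invented `recompute` signature and assuming the running totals were previously correct.

```rust
use std::collections::BTreeMap;

type Db = BTreeMap<Vec<u8>, Vec<u8>>;

/// Delta update in the spirit of `DatabaseStats::recompute`: only the modified
/// keys are re-read, once in the pre-update state and once in the post-update
/// state, and the running totals are adjusted with saturating arithmetic.
fn recompute(
    (mut total_key_size, mut total_value_size): (u64, u64),
    before: &Db,
    after: &Db,
    modified_keys: impl IntoIterator<Item = Vec<u8>>,
) -> (u64, u64, u64) {
    for key in modified_keys {
        if let Some(value) = after.get(&key) {
            total_key_size = total_key_size.saturating_add(key.len() as u64);
            total_value_size = total_value_size.saturating_add(value.len() as u64);
        }
        if let Some(value) = before.get(&key) {
            total_key_size = total_key_size.saturating_sub(key.len() as u64);
            total_value_size = total_value_size.saturating_sub(value.len() as u64);
        }
    }
    // The entry count is always refreshed from the post-update database,
    // as in the diff (`database.len(after_rtxn)?`).
    (after.len() as u64, total_key_size, total_value_size)
}

fn main() {
    let before: Db = [(b"1".to_vec(), vec![0u8; 10]), (b"2".to_vec(), vec![0u8; 20])].into();
    let mut after = before.clone();
    after.insert(b"2".to_vec(), vec![0u8; 5]); // document 2 shrank
    after.remove(&b"1".to_vec()); // document 1 was deleted

    // Totals from a full scan of `before`: keys 2 bytes, values 30 bytes.
    let stats = recompute((2, 30), &before, &after, [b"1".to_vec(), b"2".to_vec()]);
    assert_eq!(stats, (1, 1, 5));
    println!("{stats:?}");
}
```

This is why the indexing pipelines in the diff thread a `modified_docids` bitmap through extraction and `write_typed_chunk_into_index`: the cheap path needs to know exactly which keys changed.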
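The remaining changes wire a new upgrade step into both upgrade chains (`V1_13_0_To_V1_13_1`, which back-fills the documents stats, followed by `V1_13_1_To_Current`) and fix the scheduler's upgrade loop, which previously logged and applied every step from the original `from` version instead of from the result of the previous step. A hedged sketch of that dispatch pattern, with an invented `Step` type in place of the `UpgradeIndex` trait objects:

```rust
type Version = (u32, u32, u32);

/// Illustrative stand-in for the `UpgradeIndex` steps in the diff.
struct Step {
    target: Version,
    run: fn(Version) -> Result<(), String>,
}

fn upgrade(from: Version, steps: &[Step]) -> Result<Version, String> {
    // Mirrors the `start` match in `crates/milli/src/update/upgrade/mod.rs`:
    // pick the step to resume from, based on the on-disk version.
    let start = match from {
        (1, 12, 0..=2) => 0,
        (1, 12, 3..) => 1,
        (1, 13, 0) => 2,
        (1, 13, 1) => 3,
        _ => return Err(format!("cannot upgrade from v{}.{}.{}", from.0, from.1, from.2)),
    };

    // The index-scheduler fix: advance a moving `local_from` instead of
    // logging and upgrading from the original `from` on every iteration.
    let mut local_from = from;
    for step in &steps[start..] {
        println!(
            "Upgrading from v{}.{}.{} to v{}.{}.{}",
            local_from.0, local_from.1, local_from.2, step.target.0, step.target.1, step.target.2
        );
        (step.run)(local_from)?;
        local_from = step.target; // persisted via `versioning.set_version` in the real code
    }
    Ok(local_from)
}

fn main() -> Result<(), String> {
    let noop: fn(Version) -> Result<(), String> = |_| Ok(());
    let steps = [
        Step { target: (1, 12, 3), run: noop },
        Step { target: (1, 13, 0), run: noop },
        Step { target: (1, 13, 1), run: noop },
        Step { target: (1, 13, 1), run: noop }, // the "to current" step of this release
    ];
    // A database written by v1.13.0 resumes at the third step and ends at v1.13.1.
    assert_eq!(upgrade((1, 13, 0), &steps)?, (1, 13, 1));
    Ok(())
}
```

Handling the current version explicitly in the match (as the real code does with `(1, 13, _) => return Ok(false)`) matters because a failed upgrade may leave some indexes already migrated and others not.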