From 22bdec7e7479bffbce86ec9c0ceffe862c730883 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 10 Feb 2025 14:10:13 +0100 Subject: [PATCH 01/12] Add document database stats --- .../index-scheduler/src/index_mapper/mod.rs | 8 +- crates/index-scheduler/src/insta_snapshot.rs | 3 +- crates/index-scheduler/src/scheduler/test.rs | 10 +- crates/meilisearch/src/routes/indexes/mod.rs | 11 +- .../tests/documents/delete_documents.rs | 9 ++ crates/meilisearch/tests/dumps/mod.rs | 3 + crates/milli/src/database_stats.rs | 100 ++++++++++++++++++ crates/milli/src/index.rs | 6 ++ crates/milli/src/lib.rs | 1 + 9 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 crates/milli/src/database_stats.rs diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 17d683bbb..7b226ac01 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -6,6 +6,7 @@ use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; use meilisearch_types::milli; +use meilisearch_types::milli::database_stats::DatabaseStats; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -98,8 +99,9 @@ pub enum IndexStatus { /// The statistics that can be computed from an `Index` object. #[derive(Serialize, Deserialize, Debug)] pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, + /// Stats of the documents database. + #[serde(default)] + pub documents_database_stats: DatabaseStats, /// Size taken up by the index' DB, in bytes. /// /// This includes the size taken by both the used and free pages of the DB, and as the free pages @@ -138,9 +140,9 @@ impl IndexStats { pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { let arroy_stats = index.arroy_stats(rtxn)?; Ok(IndexStats { - number_of_documents: index.number_of_documents(rtxn)?, number_of_embeddings: Some(arroy_stats.number_of_embeddings), number_of_embedded_documents: Some(arroy_stats.documents.len()), + documents_database_stats: index.documents_database_stats(rtxn)?, database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bb8827fdc..261b71e69 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -365,7 +365,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String { let stats = mapper.stats_of(rtxn, &name).unwrap(); s.push_str(&format!( "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n", - stats.number_of_documents, stats.field_distribution + stats.documents_database_stats.number_of_entries(), + stats.field_distribution )); } diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 44120ff64..ddce7b2e0 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -910,7 +910,15 @@ fn create_and_list_index() { [ "kefir", { - "number_of_documents": 0, + "documents_database_stats": { + "numberOfEntries": 0, + "totalKeySize": 0, + "totalValueSize": 0, + "maxKeySize": 0, + "maxValueSize": 0, + "minKeySize": 0, + "minValueSize": 0 + }, "database_size": "[bytes]", "number_of_embeddings": 0, "number_of_embedded_documents": 0, diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 7ca8e407f..6ccdb8e71 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -494,6 +494,12 @@ pub async fn delete_index( pub struct IndexStats { /// Number of documents in the index pub number_of_documents: u64, + /// Size of the documents database, in bytes. + pub raw_document_db_size: u64, + /// Maximum size of a document in the documents database. + pub max_document_size: u64, + /// Average size of a document in the documents database. + pub avg_document_size: u64, /// Whether or not the index is currently ingesting document pub is_indexing: bool, /// Number of embeddings in the index @@ -510,7 +516,10 @@ pub struct IndexStats { impl From for IndexStats { fn from(stats: index_scheduler::IndexStats) -> Self { IndexStats { - number_of_documents: stats.inner_stats.number_of_documents, + number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + max_document_size: stats.inner_stats.documents_database_stats.max_value_size(), + avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 62cc51f29..34a2c8325 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -160,6 +160,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 4, + "rawDocumentDbSize": 42, + "maxDocumentSize": 13, + "avgDocumentSize": 10, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -209,6 +212,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 16, + "maxDocumentSize": 12, + "avgDocumentSize": 8, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -277,6 +283,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 12, + "maxDocumentSize": 12, + "avgDocumentSize": 12, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 1b07afdfd..fc523b3d7 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -187,6 +187,9 @@ async fn import_dump_v1_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "maxDocumentSize": 743, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs new file mode 100644 index 000000000..a823bb26d --- /dev/null +++ b/crates/milli/src/database_stats.rs @@ -0,0 +1,100 @@ +use heed::types::Bytes; +use heed::Database; +use heed::RoTxn; +use serde::{Deserialize, Serialize}; + +use crate::Result; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// The stats of a database. +pub struct DatabaseStats { + /// The number of entries in the database. + number_of_entries: u64, + /// The total size of the keys in the database. + total_key_size: u64, + /// The total size of the values in the database. + total_value_size: u64, + /// The maximum size of a key in the database. + max_key_size: u64, + /// The maximum size of a value in the database. + max_value_size: u64, + /// The minimum size of a key in the database. + min_key_size: u64, + /// The minimum size of a value in the database. + min_value_size: u64, +} + +impl DatabaseStats { + /// Returns the stats of the database. + /// + /// This function iterates over the whole database and computes the stats. + /// It is not efficient and should be cached somewhere. + pub(crate) fn new<'a>(database: Database, rtxn: &RoTxn<'a>) -> Result { + let mut database_stats = Self { + number_of_entries: 0, + total_key_size: 0, + total_value_size: 0, + max_key_size: 0, + max_value_size: 0, + min_key_size: u64::MAX, + min_value_size: u64::MAX, + }; + + let mut iter = database.iter(rtxn)?; + while let Some((key, value)) = iter.next().transpose()? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + database_stats.number_of_entries += 1; + database_stats.total_key_size += key_size; + database_stats.total_value_size += value_size; + database_stats.max_key_size = database_stats.max_key_size.max(key_size); + database_stats.max_value_size = database_stats.max_value_size.max(value_size); + database_stats.min_key_size = database_stats.min_key_size.min(key_size); + database_stats.min_value_size = database_stats.min_value_size.min(value_size); + } + + if database_stats.number_of_entries == 0 { + database_stats.min_key_size = 0; + database_stats.min_value_size = 0; + } + + Ok(database_stats) + } + + pub fn average_key_size(&self) -> u64 { + self.total_key_size / self.number_of_entries + } + + pub fn average_value_size(&self) -> u64 { + self.total_value_size / self.number_of_entries + } + + pub fn number_of_entries(&self) -> u64 { + self.number_of_entries + } + + pub fn total_key_size(&self) -> u64 { + self.total_key_size + } + + pub fn total_value_size(&self) -> u64 { + self.total_value_size + } + + pub fn max_key_size(&self) -> u64 { + self.max_key_size + } + + pub fn max_value_size(&self) -> u64 { + self.max_value_size + } + + pub fn min_key_size(&self) -> u64 { + self.min_key_size + } + + pub fn min_value_size(&self) -> u64 { + self.min_value_size + } +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 0550965ed..6179e117b 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -11,6 +11,7 @@ use rstar::RTree; use serde::{Deserialize, Serialize}; use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -403,6 +404,11 @@ impl Index { Ok(count.unwrap_or_default()) } + /// Returns the stats of the database. + pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result { + Ok(DatabaseStats::new(self.documents.remap_types::(), rtxn)?) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index ea88d2b78..12b5fbc2e 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -10,6 +10,7 @@ pub mod documents; mod asc_desc; mod criterion; +pub mod database_stats; mod error; mod external_documents_ids; pub mod facet; From cd4ba395e4fa50e83a03ca8f2e74ae44fbf13f2d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 11 Feb 2025 10:01:22 +0100 Subject: [PATCH 02/12] fix snapshots --- crates/meilisearch/src/routes/mod.rs | 3 +++ crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 65a12b692..02cb4130a 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -392,6 +392,9 @@ pub struct Stats { "indexes": { "movies": { "numberOfDocuments": 10, + "rawDocumentDbSize": 100, + "maxDocumentSize": 16, + "avgDocumentSize": 10, "isIndexing": true, "fieldDistribution": { "genre": 10, diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 6aab2861a..78ad9f7e4 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -134,6 +134,9 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "maxDocumentSize": 109, + "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -216,6 +219,9 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "maxDocumentSize": 109, + "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -235,6 +241,9 @@ async fn check_the_index_scheduler(server: &Server) { snapshot!(stats, @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "maxDocumentSize": 109, + "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, From fa27327db5bb366fa34cf3050514ff0d2e061cc9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 11 Feb 2025 10:15:31 +0100 Subject: [PATCH 03/12] fix clippy --- crates/milli/src/database_stats.rs | 2 +- crates/milli/src/index.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index a823bb26d..099687965 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -30,7 +30,7 @@ impl DatabaseStats { /// /// This function iterates over the whole database and computes the stats. /// It is not efficient and should be cached somewhere. - pub(crate) fn new<'a>(database: Database, rtxn: &RoTxn<'a>) -> Result { + pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> Result { let mut database_stats = Self { number_of_entries: 0, total_key_size: 0, diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 6179e117b..11e0ff1f9 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -406,7 +406,7 @@ impl Index { /// Returns the stats of the database. pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result { - Ok(DatabaseStats::new(self.documents.remap_types::(), rtxn)?) + DatabaseStats::new(self.documents.remap_types::(), rtxn) } /* primary key */ From be676f9977756010a5128803d14b6e0dcbdbc1bf Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 11 Feb 2025 12:01:29 +0100 Subject: [PATCH 04/12] Fix zero division --- crates/milli/src/database_stats.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index 099687965..ddf5827b6 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -63,11 +63,19 @@ impl DatabaseStats { } pub fn average_key_size(&self) -> u64 { - self.total_key_size / self.number_of_entries + if self.total_key_size == 0 { + 0 + } else { + self.total_key_size / self.number_of_entries + } } pub fn average_value_size(&self) -> u64 { - self.total_value_size / self.number_of_entries + if self.total_value_size == 0 { + 0 + } else { + self.total_value_size / self.number_of_entries + } } pub fn number_of_entries(&self) -> u64 { From 1bd57a9a9415efb87d586ccea164716a09234f29 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 11 Feb 2025 17:24:44 +0100 Subject: [PATCH 05/12] Use checked_div in average computation --- crates/milli/src/database_stats.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index ddf5827b6..c15280b78 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -63,19 +63,11 @@ impl DatabaseStats { } pub fn average_key_size(&self) -> u64 { - if self.total_key_size == 0 { - 0 - } else { - self.total_key_size / self.number_of_entries - } + self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0) } pub fn average_value_size(&self) -> u64 { - if self.total_value_size == 0 { - 0 - } else { - self.total_value_size / self.number_of_entries - } + self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0) } pub fn number_of_entries(&self) -> u64 { From 9a336283315440997f6919212e51a7ccb338f1ae Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 17 Feb 2025 16:36:33 +0100 Subject: [PATCH 06/12] Implement Incremental document database stats computing --- .../index-scheduler/src/index_mapper/mod.rs | 2 +- crates/milli/src/database_stats.rs | 86 +++++++++---------- crates/milli/src/index.rs | 56 +++++++++++- .../milli/src/update/index_documents/mod.rs | 7 +- .../src/update/index_documents/typed_chunk.rs | 3 + crates/milli/src/update/new/extract/cache.rs | 4 +- .../milli/src/update/new/indexer/extract.rs | 5 +- crates/milli/src/update/new/indexer/mod.rs | 4 + crates/milli/src/update/new/indexer/write.rs | 2 + 9 files changed, 116 insertions(+), 53 deletions(-) diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 7b226ac01..48e29508f 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -142,7 +142,7 @@ impl IndexStats { Ok(IndexStats { number_of_embeddings: Some(arroy_stats.number_of_embeddings), number_of_embedded_documents: Some(arroy_stats.documents.len()), - documents_database_stats: index.documents_database_stats(rtxn)?, + documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(), database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index c15280b78..cd7adab4d 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -3,8 +3,6 @@ use heed::Database; use heed::RoTxn; use serde::{Deserialize, Serialize}; -use crate::Result; - #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] #[serde(rename_all = "camelCase")] /// The stats of a database. @@ -15,14 +13,6 @@ pub struct DatabaseStats { total_key_size: u64, /// The total size of the values in the database. total_value_size: u64, - /// The maximum size of a key in the database. - max_key_size: u64, - /// The maximum size of a value in the database. - max_value_size: u64, - /// The minimum size of a key in the database. - min_key_size: u64, - /// The minimum size of a value in the database. - min_value_size: u64, } impl DatabaseStats { @@ -30,38 +20,60 @@ impl DatabaseStats { /// /// This function iterates over the whole database and computes the stats. /// It is not efficient and should be cached somewhere. - pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> Result { - let mut database_stats = Self { - number_of_entries: 0, - total_key_size: 0, - total_value_size: 0, - max_key_size: 0, - max_value_size: 0, - min_key_size: u64::MAX, - min_value_size: u64::MAX, - }; + pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> heed::Result { + let mut database_stats = + Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; let mut iter = database.iter(rtxn)?; while let Some((key, value)) = iter.next().transpose()? { let key_size = key.len() as u64; let value_size = value.len() as u64; - database_stats.number_of_entries += 1; database_stats.total_key_size += key_size; database_stats.total_value_size += value_size; - database_stats.max_key_size = database_stats.max_key_size.max(key_size); - database_stats.max_value_size = database_stats.max_value_size.max(value_size); - database_stats.min_key_size = database_stats.min_key_size.min(key_size); - database_stats.min_value_size = database_stats.min_value_size.min(value_size); } - if database_stats.number_of_entries == 0 { - database_stats.min_key_size = 0; - database_stats.min_value_size = 0; - } + database_stats.number_of_entries = database.len(rtxn)?; Ok(database_stats) } + /// Recomputes the stats of the database and returns the new stats. + /// + /// This function is used to update the stats of the database when some keys are modified. + /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. + pub(crate) fn recompute<'a, I, K>( + mut stats: Self, + database: Database, + before_rtxn: &RoTxn<'_>, + after_rtxn: &RoTxn<'_>, + modified_keys: I, + ) -> heed::Result + where + I: IntoIterator, + K: AsRef<[u8]>, + { + for key in modified_keys { + let key = key.as_ref(); + if let Some(value) = database.get(after_rtxn, key)? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + stats.total_key_size = stats.total_key_size.saturating_add(key_size); + stats.total_value_size = stats.total_value_size.saturating_add(value_size); + } + + if let Some(value) = database.get(before_rtxn, key)? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + stats.total_key_size = stats.total_key_size.saturating_sub(key_size); + stats.total_value_size = stats.total_value_size.saturating_sub(value_size); + } + } + + stats.number_of_entries = database.len(after_rtxn)?; + + Ok(stats) + } + pub fn average_key_size(&self) -> u64 { self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0) } @@ -81,20 +93,4 @@ impl DatabaseStats { pub fn total_value_size(&self) -> u64 { self.total_value_size } - - pub fn max_key_size(&self) -> u64 { - self.max_key_size - } - - pub fn max_value_size(&self) -> u64 { - self.max_value_size - } - - pub fn min_key_size(&self) -> u64 { - self.min_key_size - } - - pub fn min_value_size(&self) -> u64 { - self.min_value_size - } } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 11e0ff1f9..4d1e7a2b9 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -75,6 +75,7 @@ pub mod main_key { pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; pub const FACET_SEARCH: &str = "facet_search"; pub const PREFIX_SEARCH: &str = "prefix_search"; + pub const DOCUMENTS_STATS: &str = "documents_stats"; } pub mod db_name { @@ -404,9 +405,58 @@ impl Index { Ok(count.unwrap_or_default()) } - /// Returns the stats of the database. - pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result { - DatabaseStats::new(self.documents.remap_types::(), rtxn) + /// Updates the stats of the documents database based on the previous stats and the modified docids. + pub fn update_documents_stats( + &self, + wtxn: &mut RwTxn<'_>, + modified_docids: roaring::RoaringBitmap, + ) -> Result<()> { + let before_rtxn = self.read_txn()?; + let document_stats = match self.documents_stats(&before_rtxn)? { + Some(before_stats) => DatabaseStats::recompute( + before_stats, + self.documents.remap_types(), + &before_rtxn, + wtxn, + modified_docids.iter().map(|docid| docid.to_be_bytes()), + )?, + None => { + // This should never happen when there are already documents in the index, the documents stats should be present. + // If it happens, it means that the index was not properly initialized/upgraded. + debug_assert_eq!( + self.documents.len(&before_rtxn)?, + 0, + "The documents stats should be present when there are documents in the index" + ); + tracing::warn!("No documents stats found, creating new ones"); + DatabaseStats::new(self.documents.remap_types(), &*wtxn)? + } + }; + + self.put_documents_stats(wtxn, document_stats)?; + Ok(()) + } + + /// Writes the stats of the documents database. + pub fn put_documents_stats( + &self, + wtxn: &mut RwTxn<'_>, + stats: DatabaseStats, + ) -> heed::Result<()> { + eprintln!("putting documents stats: {:?}", stats); + self.main.remap_types::>().put( + wtxn, + main_key::DOCUMENTS_STATS, + &stats, + ) + } + + /// Returns the stats of the documents database. + pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + dbg!(self + .main + .remap_types::>() + .get(rtxn, main_key::DOCUMENTS_STATS)) } /* primary key */ diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 154db7875..4cb44c91b 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -307,6 +307,7 @@ where let current_span = tracing::Span::current(); // Run extraction pipeline in parallel. + let mut modified_docids = RoaringBitmap::new(); pool.install(|| { let settings_diff_cloned = settings_diff.clone(); rayon::spawn(move || { @@ -367,7 +368,7 @@ where Err(status) => { if let Some(typed_chunks) = chunk_accumulator.pop_longest() { let (docids, is_merged_database) = - write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?; + write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); @@ -467,6 +468,10 @@ where Ok(()) }).map_err(InternalError::from)??; + if !settings_diff.settings_update_only { + // Update the stats of the documents database when there is a document update. + self.index.update_documents_stats(self.wtxn, modified_docids)?; + } // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index d5c250e2d..0809d9601 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -129,6 +129,7 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, settings_diff: &InnerIndexSettingsDiff, typed_chunks: Vec, + modified_docids: &mut RoaringBitmap, ) -> Result<(RoaringBitmap, bool)> { let mut is_merged_database = false; match typed_chunks[0] { @@ -214,6 +215,7 @@ pub(crate) fn write_typed_chunk_into_index( kind: DocumentOperationKind::Create, }); docids.insert(docid); + modified_docids.insert(docid); } else { db.delete(wtxn, &docid)?; operations.push(DocumentOperation { @@ -222,6 +224,7 @@ pub(crate) fn write_typed_chunk_into_index( kind: DocumentOperationKind::Delete, }); docids.remove(docid); + modified_docids.insert(docid); } } let external_documents_docids = index.external_documents_ids(); diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 47bca6193..f9829032b 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -711,15 +711,17 @@ impl DelAddRoaringBitmap { DelAddRoaringBitmap { del, add } } - pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) { + pub fn apply_to(&self, documents_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap) { let DelAddRoaringBitmap { del, add } = self; if let Some(del) = del { *documents_ids -= del; + *modified_docids |= del; } if let Some(add) = add { *documents_ids |= add; + *modified_docids |= add; } } } diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 53478f029..3299d610f 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -32,6 +32,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( field_distribution: &mut BTreeMap, mut index_embeddings: Vec, document_ids: &mut RoaringBitmap, + modified_docids: &mut RoaringBitmap, ) -> Result<(FacetFieldIdsDelta, Vec)> where DC: DocumentChanges<'pl>, @@ -70,7 +71,7 @@ where // adding the delta should never cause a negative result, as we are removing fields that previously existed. *current = current.saturating_add_signed(delta); } - document_extractor_data.docids_delta.apply_to(document_ids); + document_extractor_data.docids_delta.apply_to(document_ids, modified_docids); } field_distribution.retain(|_, v| *v != 0); @@ -256,7 +257,7 @@ where let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); + deladd.apply_to(&mut config.user_provided, modified_docids); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 890191323..9717b358b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -129,6 +129,7 @@ where let index_embeddings = index.embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; + let mut modified_docids = roaring::RoaringBitmap::new(); thread::scope(|s| -> Result<()> { let indexer_span = tracing::Span::current(); @@ -137,6 +138,7 @@ where // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; + let modified_docids = &mut modified_docids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.install(move || { @@ -151,6 +153,7 @@ where field_distribution, index_embeddings, document_ids, + modified_docids, ) }) .unwrap() @@ -225,6 +228,7 @@ where embedders, field_distribution, document_ids, + modified_docids, )?; Ok(()) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 707599ba3..c4c046360 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -121,6 +121,7 @@ pub(super) fn update_index( embedders: EmbeddingConfigs, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, + modified_docids: roaring::RoaringBitmap, ) -> Result<()> { index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { @@ -132,6 +133,7 @@ pub(super) fn update_index( index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + index.update_documents_stats(wtxn, modified_docids)?; Ok(()) } From 285c72a9605529d4fcde6ddcbede4c20c4494379 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 17 Feb 2025 16:36:58 +0100 Subject: [PATCH 07/12] Update Snapshots --- crates/index-scheduler/src/scheduler/test.rs | 6 +---- crates/meilisearch/src/routes/indexes/mod.rs | 5 ++-- .../tests/documents/delete_documents.rs | 3 --- crates/meilisearch/tests/dumps/mod.rs | 27 ++++++++++++++++++- crates/meilisearch/tests/stats/mod.rs | 16 +++++++++++ .../tests/upgrade/v1_12/v1_12_0.rs | 3 --- 6 files changed, 45 insertions(+), 15 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index ddce7b2e0..84112de08 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -913,11 +913,7 @@ fn create_and_list_index() { "documents_database_stats": { "numberOfEntries": 0, "totalKeySize": 0, - "totalValueSize": 0, - "maxKeySize": 0, - "maxValueSize": 0, - "minKeySize": 0, - "minValueSize": 0 + "totalValueSize": 0 }, "database_size": "[bytes]", "number_of_embeddings": 0, diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 6ccdb8e71..bbcb3674b 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -496,8 +496,6 @@ pub struct IndexStats { pub number_of_documents: u64, /// Size of the documents database, in bytes. pub raw_document_db_size: u64, - /// Maximum size of a document in the documents database. - pub max_document_size: u64, /// Average size of a document in the documents database. pub avg_document_size: u64, /// Whether or not the index is currently ingesting document @@ -518,7 +516,6 @@ impl From for IndexStats { IndexStats { number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(), raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), - max_document_size: stats.inner_stats.documents_database_stats.max_value_size(), avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, @@ -541,6 +538,8 @@ impl From for IndexStats { (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( { "numberOfDocuments": 10, + "rawDocumentDbSize": 10, + "avgDocumentSize": 10, "numberOfEmbeddings": 10, "numberOfEmbeddedDocuments": 10, "isIndexing": true, diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 34a2c8325..4dfe2cc79 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -161,7 +161,6 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 4, "rawDocumentDbSize": 42, - "maxDocumentSize": 13, "avgDocumentSize": 10, "isIndexing": false, "numberOfEmbeddings": 0, @@ -213,7 +212,6 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 2, "rawDocumentDbSize": 16, - "maxDocumentSize": 12, "avgDocumentSize": 8, "isIndexing": false, "numberOfEmbeddings": 0, @@ -284,7 +282,6 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 1, "rawDocumentDbSize": 12, - "maxDocumentSize": 12, "avgDocumentSize": 12, "isIndexing": false, "numberOfEmbeddings": 0, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index fc523b3d7..ef6e48f69 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -32,6 +32,8 @@ async fn import_dump_v1_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -188,7 +190,6 @@ async fn import_dump_v1_movie_with_settings() { { "numberOfDocuments": 53, "rawDocumentDbSize": 21965, - "maxDocumentSize": 743, "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, @@ -358,6 +359,8 @@ async fn import_dump_v1_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -523,6 +526,8 @@ async fn import_dump_v2_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -678,6 +683,8 @@ async fn import_dump_v2_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -843,6 +850,8 @@ async fn import_dump_v2_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1005,6 +1014,8 @@ async fn import_dump_v3_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1160,6 +1171,8 @@ async fn import_dump_v3_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1325,6 +1338,8 @@ async fn import_dump_v3_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1487,6 +1502,8 @@ async fn import_dump_v4_movie_raw() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1642,6 +1659,8 @@ async fn import_dump_v4_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1807,6 +1826,8 @@ async fn import_dump_v4_rubygems_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1976,6 +1997,8 @@ async fn import_dump_v5() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 10, + "rawDocumentDbSize": 6782, + "avgDocumentSize": 678, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -2012,6 +2035,8 @@ async fn import_dump_v5() { @r###" { "numberOfDocuments": 10, + "rawDocumentDbSize": 6782, + "avgDocumentSize": 678, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index bb10d2cd5..20a8eaef6 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -113,6 +113,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -136,6 +138,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, @@ -159,6 +163,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 2, @@ -183,6 +189,8 @@ async fn add_remove_embeddings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 1, @@ -231,6 +239,8 @@ async fn add_remove_embedded_documents() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -250,6 +260,8 @@ async fn add_remove_embedded_documents() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 13, + "avgDocumentSize": 13, "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 1, @@ -281,6 +293,8 @@ async fn update_embedder_settings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 108, + "avgDocumentSize": 54, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -315,6 +329,8 @@ async fn update_embedder_settings() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 108, + "avgDocumentSize": 54, "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 78ad9f7e4..cce90907d 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -135,7 +135,6 @@ async fn check_the_index_scheduler(server: &Server) { "kefir": { "numberOfDocuments": 1, "rawDocumentDbSize": 109, - "maxDocumentSize": 109, "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, @@ -220,7 +219,6 @@ async fn check_the_index_scheduler(server: &Server) { "kefir": { "numberOfDocuments": 1, "rawDocumentDbSize": 109, - "maxDocumentSize": 109, "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, @@ -242,7 +240,6 @@ async fn check_the_index_scheduler(server: &Server) { { "numberOfDocuments": 1, "rawDocumentDbSize": 109, - "maxDocumentSize": 109, "avgDocumentSize": 109, "isIndexing": false, "numberOfEmbeddings": 0, From 9505f15c85dbb450af02330e110d8ff28134f664 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 17 Feb 2025 16:37:17 +0100 Subject: [PATCH 08/12] Dumpless upgrade --- crates/milli/src/update/upgrade/mod.rs | 11 +++++-- crates/milli/src/update/upgrade/v1_13.rs | 37 ++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index 16f0eef7a..0ed67f2cb 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -3,7 +3,7 @@ mod v1_13; use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; -use v1_13::V1_13_0_To_Current; +use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Current}; use crate::progress::{Progress, VariableNameStep}; use crate::{Index, InternalError, Result}; @@ -28,13 +28,18 @@ pub fn upgrade( progress: Progress, ) -> Result { let from = index.get_version(wtxn)?.unwrap_or(db_version); - let upgrade_functions: &[&dyn UpgradeIndex] = - &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()]; + let upgrade_functions: &[&dyn UpgradeIndex] = &[ + &V1_12_To_V1_12_3 {}, + &V1_12_3_To_V1_13_0 {}, + &V1_13_0_To_V1_13_1 {}, + &V1_13_1_To_Current {}, + ]; let start = match from { (1, 12, 0..=2) => 0, (1, 12, 3..) => 1, (1, 13, 0) => 2, + (1, 13, 1) => 3, // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 13, _) => return Ok(false), (major, minor, patch) => { diff --git a/crates/milli/src/update/upgrade/v1_13.rs b/crates/milli/src/update/upgrade/v1_13.rs index 52246a7f3..f1d56d9cb 100644 --- a/crates/milli/src/update/upgrade/v1_13.rs +++ b/crates/milli/src/update/upgrade/v1_13.rs @@ -2,13 +2,44 @@ use heed::RwTxn; use super::UpgradeIndex; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use crate::database_stats::DatabaseStats; use crate::progress::Progress; -use crate::{Index, Result}; +use crate::{make_enum_progress, Index, Result}; #[allow(non_camel_case_types)] -pub(super) struct V1_13_0_To_Current(); +pub(super) struct V1_13_0_To_V1_13_1(); -impl UpgradeIndex for V1_13_0_To_Current { +impl UpgradeIndex for V1_13_0_To_V1_13_1 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + progress: Progress, + ) -> Result { + make_enum_progress! { + enum DocumentsStats { + CreatingDocumentsStats, + } + }; + + // Create the new documents stats. + progress.update_progress(DocumentsStats::CreatingDocumentsStats); + let stats = DatabaseStats::new(index.documents.remap_types(), wtxn)?; + index.put_documents_stats(wtxn, stats)?; + + Ok(true) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 13, 1) + } +} + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_1_To_Current(); + +impl UpgradeIndex for V1_13_1_To_Current { fn upgrade( &self, _wtxn: &mut RwTxn, From 57b26f844131a2cc05118d8918c7f960aeeb3341 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 17 Feb 2025 16:41:34 +0100 Subject: [PATCH 09/12] fix clippy --- crates/milli/src/database_stats.rs | 2 +- crates/milli/src/update/new/indexer/write.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index cd7adab4d..d97dc13ba 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -41,7 +41,7 @@ impl DatabaseStats { /// /// This function is used to update the stats of the database when some keys are modified. /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. - pub(crate) fn recompute<'a, I, K>( + pub(crate) fn recompute( mut stats: Self, database: Database, before_rtxn: &RoTxn<'_>, diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index c4c046360..16b129c9f 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -113,6 +113,7 @@ where Ok(()) } +#[allow(clippy::too_many_arguments)] pub(super) fn update_index( index: &Index, wtxn: &mut RwTxn<'_>, From 8ccd090f4010a16130e707d10f5701697a710def Mon Sep 17 00:00:00 2001 From: Many the fish Date: Wed, 26 Feb 2025 10:28:25 +0100 Subject: [PATCH 10/12] Update crates/milli/src/index.rs Co-authored-by: Tamo --- crates/milli/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 4d1e7a2b9..5f51c08ce 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -453,10 +453,10 @@ impl Index { /// Returns the stats of the documents database. pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result> { - dbg!(self + self .main .remap_types::>() - .get(rtxn, main_key::DOCUMENTS_STATS)) + .get(rtxn, main_key::DOCUMENTS_STATS) } /* primary key */ From 50354872085c1d125718c6cec09a54085d95d17c Mon Sep 17 00:00:00 2001 From: Many the fish Date: Wed, 26 Feb 2025 10:28:51 +0100 Subject: [PATCH 11/12] Update crates/milli/src/index.rs Co-authored-by: Tamo --- crates/milli/src/index.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 5f51c08ce..c24f55a95 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -443,7 +443,6 @@ impl Index { wtxn: &mut RwTxn<'_>, stats: DatabaseStats, ) -> heed::Result<()> { - eprintln!("putting documents stats: {:?}", stats); self.main.remap_types::>().put( wtxn, main_key::DOCUMENTS_STATS, From df2fcac36c5e34e3808aedf8286da6112b8e41ea Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 26 Feb 2025 10:35:03 +0100 Subject: [PATCH 12/12] Fix fmt --- crates/milli/src/index.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index c24f55a95..645b1fef6 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -452,8 +452,7 @@ impl Index { /// Returns the stats of the documents database. pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result> { - self - .main + self.main .remap_types::>() .get(rtxn, main_key::DOCUMENTS_STATS) }