From 41203f0931006dcf96d895f71a7c3b51f2d289a3 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 12 Feb 2025 11:37:47 +0100
Subject: [PATCH] Add embedders stats

---
 .../index-scheduler/src/index_mapper/mod.rs   |  9 +++++
 crates/meilisearch/src/routes/indexes/mod.rs  | 10 ++++++
 crates/milli/src/index.rs                     | 14 +++++++-
 crates/milli/src/vector/mod.rs                | 35 +++++++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs
index dad73d4c6..17d683bbb 100644
--- a/crates/index-scheduler/src/index_mapper/mod.rs
+++ b/crates/index-scheduler/src/index_mapper/mod.rs
@@ -106,6 +106,12 @@ pub struct IndexStats {
     /// are not returned to the disk after a deletion, this number is typically larger than
     /// `used_database_size` that only includes the size of the used pages.
     pub database_size: u64,
+    /// Number of embeddings in the index.
+    /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch
+    pub number_of_embeddings: Option<u64>,
+    /// Number of embedded documents in the index.
+    /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch
+    pub number_of_embedded_documents: Option<u64>,
     /// Size taken by the used pages of the index' DB, in bytes.
     ///
     /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
@@ -130,8 +136,11 @@ impl IndexStats {
     ///
     /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
     pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
+        let arroy_stats = index.arroy_stats(rtxn)?;
         Ok(IndexStats {
             number_of_documents: index.number_of_documents(rtxn)?,
+            number_of_embeddings: Some(arroy_stats.number_of_embeddings),
+            number_of_embedded_documents: Some(arroy_stats.documents.len()),
             database_size: index.on_disk_size()?,
             used_database_size: index.used_size()?,
             primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs
index a03d5f691..7ca8e407f 100644
--- a/crates/meilisearch/src/routes/indexes/mod.rs
+++ b/crates/meilisearch/src/routes/indexes/mod.rs
@@ -496,6 +496,12 @@ pub struct IndexStats {
     pub number_of_documents: u64,
     /// Whether or not the index is currently ingesting document
     pub is_indexing: bool,
+    /// Number of embeddings in the index
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub number_of_embeddings: Option<u64>,
+    /// Number of embedded documents in the index
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub number_of_embedded_documents: Option<u64>,
     /// Association of every field name with the number of times it occurs in the documents.
     #[schema(value_type = HashMap<String, u64>)]
     pub field_distribution: FieldDistribution,
@@ -506,6 +512,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
         IndexStats {
             number_of_documents: stats.inner_stats.number_of_documents,
             is_indexing: stats.is_indexing,
+            number_of_embeddings: stats.inner_stats.number_of_embeddings,
+            number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,
             field_distribution: stats.inner_stats.field_distribution,
         }
     }
@@ -524,6 +532,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
         (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
             {
                 "numberOfDocuments": 10,
+                "numberOfEmbeddings": 10,
+                "numberOfEmbeddedDocuments": 10,
                 "isIndexing": true,
                 "fieldDistribution": {
                     "genre": 10,
diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs
index 944fb6cd4..0550965ed 100644
--- a/crates/milli/src/index.rs
+++ b/crates/milli/src/index.rs
@@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec;
 use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
 use crate::order_by_map::OrderByMap;
 use crate::proximity::ProximityPrecision;
-use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
+use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
@@ -1731,6 +1731,18 @@ impl Index {
         let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
         Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
     }
+
+    pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
+        let mut stats = ArroyStats::default();
+        let embedding_configs = self.embedding_configs(rtxn)?;
+        for config in embedding_configs {
+            let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
+            let reader =
+                ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
+            reader.aggregate_stats(rtxn, &mut stats)?;
+        }
+        Ok(stats)
+    }
 }
 
 #[derive(Debug, Deserialize, Serialize)]
diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index 9ccd7341c..a8ae4a1d8 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -410,8 +410,43 @@ impl ArroyWrapper {
     fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
         self.database.remap_data_type()
     }
+
+    pub fn aggregate_stats(
+        &self,
+        rtxn: &RoTxn,
+        stats: &mut ArroyStats,
+    ) -> Result<(), arroy::Error> {
+        if self.quantized {
+            for reader in self.readers(rtxn, self.quantized_db()) {
+                let reader = reader?;
+                let documents = reader.item_ids();
+                if documents.is_empty() {
+                    break;
+                }
+                stats.documents |= documents;
+                stats.number_of_embeddings += documents.len() as u64;
+            }
+        } else {
+            for reader in self.readers(rtxn, self.angular_db()) {
+                let reader = reader?;
+                let documents = reader.item_ids();
+                if documents.is_empty() {
+                    break;
+                }
+                stats.documents |= documents;
+                stats.number_of_embeddings += documents.len() as u64;
+            }
+        }
+
+        Ok(())
+    }
 }
 
+#[derive(Debug, Default, Clone)]
+pub struct ArroyStats {
+    pub number_of_embeddings: u64,
+    pub documents: RoaringBitmap,
+}
 /// One or multiple embeddings stored consecutively in a flat vector.
 pub struct Embeddings<F> {
     data: Vec<F>,