Add embedders stats

This commit is contained in:
ManyTheFish 2025-02-12 11:37:47 +01:00
parent 70305b9f71
commit 41203f0931
4 changed files with 67 additions and 1 deletions

View File

@ -106,6 +106,12 @@ pub struct IndexStats {
/// are not returned to the disk after a deletion, this number is typically larger than /// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages. /// `used_database_size` that only includes the size of the used pages.
pub database_size: u64, pub database_size: u64,
/// Number of embeddings in the index.
/// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch
pub number_of_embeddings: Option<u64>,
/// Number of embedded documents in the index.
/// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch
pub number_of_embedded_documents: Option<u64>,
/// Size taken by the used pages of the index' DB, in bytes. /// Size taken by the used pages of the index' DB, in bytes.
/// ///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB, /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
@ -130,8 +136,11 @@ impl IndexStats {
/// ///
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> { pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
let arroy_stats = index.arroy_stats(rtxn)?;
Ok(IndexStats { Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?, number_of_documents: index.number_of_documents(rtxn)?,
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
number_of_embedded_documents: Some(arroy_stats.documents.len()),
database_size: index.on_disk_size()?, database_size: index.on_disk_size()?,
used_database_size: index.used_size()?, used_database_size: index.used_size()?,
primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),

View File

@ -496,6 +496,12 @@ pub struct IndexStats {
pub number_of_documents: u64, pub number_of_documents: u64,
/// Whether or not the index is currently ingesting document /// Whether or not the index is currently ingesting document
pub is_indexing: bool, pub is_indexing: bool,
/// Number of embeddings in the index
#[serde(skip_serializing_if = "Option::is_none")]
pub number_of_embeddings: Option<u64>,
/// Number of embedded documents in the index
#[serde(skip_serializing_if = "Option::is_none")]
pub number_of_embedded_documents: Option<u64>,
/// Association of every field name with the number of times it occurs in the documents. /// Association of every field name with the number of times it occurs in the documents.
#[schema(value_type = HashMap<String, u64>)] #[schema(value_type = HashMap<String, u64>)]
pub field_distribution: FieldDistribution, pub field_distribution: FieldDistribution,
@ -506,6 +512,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
IndexStats { IndexStats {
number_of_documents: stats.inner_stats.number_of_documents, number_of_documents: stats.inner_stats.number_of_documents,
is_indexing: stats.is_indexing, is_indexing: stats.is_indexing,
number_of_embeddings: stats.inner_stats.number_of_embeddings,
number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,
field_distribution: stats.inner_stats.field_distribution, field_distribution: stats.inner_stats.field_distribution,
} }
} }
@ -524,6 +532,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
(status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
{ {
"numberOfDocuments": 10, "numberOfDocuments": 10,
"numberOfEmbeddings": 10,
"numberOfEmbeddedDocuments": 10,
"isIndexing": true, "isIndexing": true,
"fieldDistribution": { "fieldDistribution": {
"genre": 10, "genre": 10,

View File

@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec;
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
use crate::order_by_map::OrderByMap; use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision; use crate::proximity::ProximityPrecision;
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
use crate::{ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
@ -1731,6 +1731,18 @@ impl Index {
let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
} }
pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
let mut stats = ArroyStats::default();
let embedding_configs = self.embedding_configs(rtxn)?;
for config in embedding_configs {
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
let reader =
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
reader.aggregate_stats(rtxn, &mut stats)?;
}
Ok(stats)
}
} }
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize)]

View File

@ -410,8 +410,43 @@ impl ArroyWrapper {
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> { fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
self.database.remap_data_type() self.database.remap_data_type()
} }
pub fn aggregate_stats(
&self,
rtxn: &RoTxn,
stats: &mut ArroyStats,
) -> Result<(), arroy::Error> {
if self.quantized {
for reader in self.readers(rtxn, self.quantized_db()) {
let reader = reader?;
let documents = reader.item_ids();
if documents.is_empty() {
break;
}
stats.documents |= documents;
stats.number_of_embeddings += documents.len() as u64;
}
} else {
for reader in self.readers(rtxn, self.angular_db()) {
let reader = reader?;
let documents = reader.item_ids();
if documents.is_empty() {
break;
}
stats.documents |= documents;
stats.number_of_embeddings += documents.len() as u64;
}
}
Ok(())
}
} }
#[derive(Debug, Default, Clone)]
pub struct ArroyStats {
pub number_of_embeddings: u64,
pub documents: RoaringBitmap,
}
/// One or multiple embeddings stored consecutively in a flat vector. /// One or multiple embeddings stored consecutively in a flat vector.
pub struct Embeddings<F> { pub struct Embeddings<F> {
data: Vec<F>, data: Vec<F>,