mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-02-15 23:30:37 +08:00
Add embedders stats
This commit is contained in:
parent
70305b9f71
commit
41203f0931
@ -106,6 +106,12 @@ pub struct IndexStats {
|
|||||||
/// are not returned to the disk after a deletion, this number is typically larger than
|
/// are not returned to the disk after a deletion, this number is typically larger than
|
||||||
/// `used_database_size` that only includes the size of the used pages.
|
/// `used_database_size` that only includes the size of the used pages.
|
||||||
pub database_size: u64,
|
pub database_size: u64,
|
||||||
|
/// Number of embeddings in the index.
|
||||||
|
/// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch
|
||||||
|
pub number_of_embeddings: Option<u64>,
|
||||||
|
/// Number of embedded documents in the index.
|
||||||
|
/// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch
|
||||||
|
pub number_of_embedded_documents: Option<u64>,
|
||||||
/// Size taken by the used pages of the index' DB, in bytes.
|
/// Size taken by the used pages of the index' DB, in bytes.
|
||||||
///
|
///
|
||||||
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
||||||
@ -130,8 +136,11 @@ impl IndexStats {
|
|||||||
///
|
///
|
||||||
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
|
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
|
||||||
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
|
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
|
||||||
|
let arroy_stats = index.arroy_stats(rtxn)?;
|
||||||
Ok(IndexStats {
|
Ok(IndexStats {
|
||||||
number_of_documents: index.number_of_documents(rtxn)?,
|
number_of_documents: index.number_of_documents(rtxn)?,
|
||||||
|
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
|
||||||
|
number_of_embedded_documents: Some(arroy_stats.documents.len()),
|
||||||
database_size: index.on_disk_size()?,
|
database_size: index.on_disk_size()?,
|
||||||
used_database_size: index.used_size()?,
|
used_database_size: index.used_size()?,
|
||||||
primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
|
primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
|
||||||
|
@ -496,6 +496,12 @@ pub struct IndexStats {
|
|||||||
pub number_of_documents: u64,
|
pub number_of_documents: u64,
|
||||||
/// Whether or not the index is currently ingesting document
|
/// Whether or not the index is currently ingesting document
|
||||||
pub is_indexing: bool,
|
pub is_indexing: bool,
|
||||||
|
/// Number of embeddings in the index
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub number_of_embeddings: Option<u64>,
|
||||||
|
/// Number of embedded documents in the index
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub number_of_embedded_documents: Option<u64>,
|
||||||
/// Association of every field name with the number of times it occurs in the documents.
|
/// Association of every field name with the number of times it occurs in the documents.
|
||||||
#[schema(value_type = HashMap<String, u64>)]
|
#[schema(value_type = HashMap<String, u64>)]
|
||||||
pub field_distribution: FieldDistribution,
|
pub field_distribution: FieldDistribution,
|
||||||
@ -506,6 +512,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
|
|||||||
IndexStats {
|
IndexStats {
|
||||||
number_of_documents: stats.inner_stats.number_of_documents,
|
number_of_documents: stats.inner_stats.number_of_documents,
|
||||||
is_indexing: stats.is_indexing,
|
is_indexing: stats.is_indexing,
|
||||||
|
number_of_embeddings: stats.inner_stats.number_of_embeddings,
|
||||||
|
number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,
|
||||||
field_distribution: stats.inner_stats.field_distribution,
|
field_distribution: stats.inner_stats.field_distribution,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -524,6 +532,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
|
|||||||
(status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
|
(status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
|
||||||
{
|
{
|
||||||
"numberOfDocuments": 10,
|
"numberOfDocuments": 10,
|
||||||
|
"numberOfEmbeddings": 10,
|
||||||
|
"numberOfEmbeddedDocuments": 10,
|
||||||
"isIndexing": true,
|
"isIndexing": true,
|
||||||
"fieldDistribution": {
|
"fieldDistribution": {
|
||||||
"genre": 10,
|
"genre": 10,
|
||||||
|
@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec;
|
|||||||
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
|
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
|
||||||
use crate::order_by_map::OrderByMap;
|
use crate::order_by_map::OrderByMap;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
|
use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
|
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
|
||||||
@ -1731,6 +1731,18 @@ impl Index {
|
|||||||
let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
|
let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
|
||||||
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
|
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
|
||||||
|
let mut stats = ArroyStats::default();
|
||||||
|
let embedding_configs = self.embedding_configs(rtxn)?;
|
||||||
|
for config in embedding_configs {
|
||||||
|
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
|
||||||
|
let reader =
|
||||||
|
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
|
||||||
|
reader.aggregate_stats(rtxn, &mut stats)?;
|
||||||
|
}
|
||||||
|
Ok(stats)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
@ -410,8 +410,43 @@ impl ArroyWrapper {
|
|||||||
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
|
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn aggregate_stats(
|
||||||
|
&self,
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
stats: &mut ArroyStats,
|
||||||
|
) -> Result<(), arroy::Error> {
|
||||||
|
if self.quantized {
|
||||||
|
for reader in self.readers(rtxn, self.quantized_db()) {
|
||||||
|
let reader = reader?;
|
||||||
|
let documents = reader.item_ids();
|
||||||
|
if documents.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
stats.documents |= documents;
|
||||||
|
stats.number_of_embeddings += documents.len() as u64;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for reader in self.readers(rtxn, self.angular_db()) {
|
||||||
|
let reader = reader?;
|
||||||
|
let documents = reader.item_ids();
|
||||||
|
if documents.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
stats.documents |= documents;
|
||||||
|
stats.number_of_embeddings += documents.len() as u64;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default, Clone)]
|
||||||
|
pub struct ArroyStats {
|
||||||
|
pub number_of_embeddings: u64,
|
||||||
|
pub documents: RoaringBitmap,
|
||||||
|
}
|
||||||
/// One or multiple embeddings stored consecutively in a flat vector.
|
/// One or multiple embeddings stored consecutively in a flat vector.
|
||||||
pub struct Embeddings<F> {
|
pub struct Embeddings<F> {
|
||||||
data: Vec<F>,
|
data: Vec<F>,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user