From 075fcc2c0885d1b43acd31b7359e1957fdc27ff8 Mon Sep 17 00:00:00 2001 From: ThalusA <15985204+ThalusA@users.noreply.github.com> Date: Fri, 12 Jul 2024 01:20:01 +0200 Subject: [PATCH] optimize /indexes route Optimize index listing by only processing the indexes that end up in the requested page of the pagination instead of processing all of them --- index-scheduler/src/index_mapper/mod.rs | 10 ++++++- index-scheduler/src/lib.rs | 27 +++++++++++++++++- meilisearch/src/routes/indexes/mod.rs | 37 ++++++++++++++++++------- 3 files changed, 62 insertions(+), 12 deletions(-) diff --git a/index-scheduler/src/index_mapper/mod.rs b/index-scheduler/src/index_mapper/mod.rs index 14908120c..56cc5935a 100644 --- a/index-scheduler/src/index_mapper/mod.rs +++ b/index-scheduler/src/index_mapper/mod.rs @@ -4,7 +4,7 @@ use std::time::Duration; use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; -use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; +use meilisearch_types::heed::{Database, Env, RoIter, RoTxn, RwTxn}; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -111,6 +111,8 @@ pub struct IndexStats { pub created_at: OffsetDateTime, /// Date of the last update of the index. pub updated_at: OffsetDateTime, + /// Primary key of the index. + pub primary_key: Option, } impl IndexStats { @@ -127,6 +129,7 @@ impl IndexStats { field_distribution: index.field_distribution(rtxn)?, created_at: index.created_at(rtxn)?, updated_at: index.updated_at(rtxn)?, + primary_key: index.primary_key(rtxn)?.map(String::from), }) } } @@ -416,6 +419,11 @@ impl IndexMapper { .collect() } + /// Return an iterator over the database entries that lives only as long as the transaction does. + pub fn iter<'txn>(&self, rtxn: &'txn RoTxn) -> Result> { + self.index_mapping.iter(rtxn).map_err(Error::from) + } + /// Return the name of all indexes without opening them. 
pub fn index_names(&self, rtxn: &RoTxn) -> Result> { self.index_mapping diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index ca9bca820..54a894cfb 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -51,7 +51,7 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; -use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; +use meilisearch_types::heed::{self, Database, Env, PutFlags, RoIter, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; @@ -70,6 +70,7 @@ use uuid::Uuid; use crate::index_mapper::IndexMapper; use crate::utils::{check_index_swap_validity, clamp_to_page_size}; +use crate::uuid_codec::UuidCodec; pub(crate) type BEI128 = I128; @@ -415,6 +416,23 @@ impl IndexScheduler { } } +/// An owned type for database entries iterator and its transaction. +/// To get the inner iterator you should call .iter() on it. +pub struct IndexIterator<'txn> { + rtxn: RoTxn<'txn>, + index_mapper: &'txn IndexMapper, +} + +impl<'txn> IndexIterator<'txn> { + pub fn new(rtxn: RoTxn<'txn>, index_mapper: &'txn IndexMapper) -> IndexIterator<'txn> { + Self { rtxn, index_mapper } + } + + pub fn iter(&'txn self) -> Result> { + self.index_mapper.iter(&self.rtxn) + } +} + impl IndexScheduler { /// Create an index scheduler and start its run loop. pub fn new( @@ -691,6 +709,13 @@ impl IndexScheduler { self.index_mapper.try_for_each_index(&rtxn, f) } + /// Return an owned type for the database entries iterator. + /// You should call .iter() on it to get an iterator over the database entries. 
+ pub fn iter(&self) -> Result { + let rtxn = self.env.read_txn()?; + Ok(IndexIterator::new(rtxn, &self.index_mapper)) + } + /// Return the task ids matched by the given query from the index scheduler's point of view. pub(crate) fn get_task_ids(&self, rtxn: &RoTxn, query: &Query) -> Result { let ProcessingTasks { diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 35b747ccf..c8a75d9f1 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -99,16 +99,33 @@ pub async fn list_indexes( ) -> Result { debug!(parameters = ?paginate, "List indexes"); let filters = index_scheduler.filters(); - let indexes: Vec> = - index_scheduler.try_for_each_index(|uid, index| -> Result, _> { - if !filters.is_index_authorized(uid) { - return Ok(None); - } - Ok(Some(IndexView::new(uid.to_string(), index)?)) - })?; - // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. - let indexes: Vec = indexes.into_iter().flatten().collect(); - let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter()); + let index_iterator = index_scheduler.iter()?; + let database_iterator = index_iterator.iter()?; + let indexes = database_iterator + .filter(|res| { + res.as_ref().map(|(name, _)| filters.is_index_authorized(name)).unwrap_or(false) + }) + .flat_map(|res| { + res.ok().and_then(|(name, _)| { + index_scheduler.index_stats(name).ok().map(|index| IndexView { + uid: name.to_string(), + created_at: index.inner_stats.created_at, + updated_at: index.inner_stats.updated_at, + primary_key: index.inner_stats.primary_key, + }) + }) + }); + // The `indexes` iterator above does not have its size_hint() filled. + // To find out how many elements there are, we must create a second iterator that only + // keeps the authorized indexes that are valid, consume it, and return the number of elements. 
+ let index_iterator = index_scheduler.iter()?; + let database_iterator = index_iterator.iter()?; + let count = database_iterator + .filter(|res| { + res.as_ref().ok().map(|(name, _)| filters.is_index_authorized(name)).unwrap_or(false) + }) + .count(); + let ret = paginate.as_pagination().auto_paginate_unsized(count, indexes); debug!(returns = ?ret, "List indexes"); Ok(HttpResponse::Ok().json(ret))