From 05cc8c650cecaf1d7753803c8b8284b84a845476 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 19 Feb 2025 14:00:21 +0100
Subject: [PATCH] Expose the write channel congestion in the batches
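indexer::index() now measures the congestion of the internal bbqueue write
channel: every attempt to send a message is counted, along with the attempts
that had to block and retry because the buffer was full. The measurement is
returned as a new ChannelCongestion struct, threaded through
process_batch()/apply_index_operation(), and stored in the batch stats as an
optional `write_channel_congestion` object with the keys `attempts`,
`blocking_attempts` and `blocking_ratio` (blocking_attempts / attempts).
The field is skipped entirely when the batch performed no document indexing.

Illustrative shape of the new stats entry, with made-up numbers (the casing
of the surrounding BatchStats field names follows that struct's existing
serde configuration):

    "write_channel_congestion": {
        "attempts": 51065,
        "blocking_attempts": 3721,
        "blocking_ratio": 0.0728
    }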
---
 crates/dump/src/lib.rs                        |   1 +
 crates/index-scheduler/src/scheduler/mod.rs   |  16 ++-
 .../src/scheduler/process_batch.rs            |  29 +++--
 .../src/scheduler/process_index_operation.rs  | 118 ++++++++++--------
 crates/meilisearch-types/src/batches.rs       |   2 +
 crates/milli/src/lib.rs                       |   1 +
 crates/milli/src/update/mod.rs                |   1 +
 .../milli/src/update/new/indexer/extract.rs   |   2 +-
 crates/milli/src/update/new/indexer/mod.rs    |  12 +-
 crates/milli/src/update/new/indexer/write.rs  |  43 ++++---
 crates/milli/src/update/new/mod.rs            |   1 +
 crates/milli/src/update/new/steps.rs          |   4 +-
 12 files changed, 138 insertions(+), 92 deletions(-)

diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs
index c3861f6a1..70213ce69 100644
--- a/crates/dump/src/lib.rs
+++ b/crates/dump/src/lib.rs
@@ -322,6 +322,7 @@ pub(crate) mod test {
                 types: maplit::btreemap! { Kind::DocumentAdditionOrUpdate => 1 },
                 index_uids: maplit::btreemap! { "doggo".to_string() => 1 },
                 call_trace: Default::default(),
+                write_channel_congestion: None,
             },
             enqueued_at: Some(BatchEnqueuedAt {
                 earliest: datetime!(2022-11-11 0:00 UTC),
diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs
index 08eb1c317..bcf53127b 100644
--- a/crates/index-scheduler/src/scheduler/mod.rs
+++ b/crates/index-scheduler/src/scheduler/mod.rs
@@ -215,14 +215,16 @@ impl IndexScheduler {
         let mut stop_scheduler_forever = false;
         let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?;
         let mut canceled = RoaringBitmap::new();
+        let mut congestion = None;
 
         match res {
-            Ok(tasks) => {
+            Ok((tasks, cong)) => {
                 #[cfg(test)]
                 self.breakpoint(crate::test_utils::Breakpoint::ProcessBatchSucceeded);
 
                 let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32);
                 progress.update_progress(task_progress_obj);
+                congestion = cong;
                 let mut success = 0;
                 let mut failure = 0;
                 let mut canceled_by = None;
@@ -339,9 +341,17 @@ impl IndexScheduler {
 
         // We must re-add the canceled task so they're part of the same batch.
         ids |= canceled;
-        let durations = progress.accumulated_durations();
         processing_batch.stats.call_trace =
-            durations.into_iter().map(|(k, v)| (k, v.into())).collect();
+            progress.accumulated_durations().into_iter().map(|(k, v)| (k, v.into())).collect();
+        processing_batch.stats.write_channel_congestion = congestion.map(|congestion| {
+            let mut congestion_info = serde_json::Map::new();
+            congestion_info.insert("attempts".into(), congestion.attempts.into());
+            congestion_info
+                .insert("blocking_attempts".into(), congestion.blocking_attempts.into());
+            congestion_info.insert("blocking_ratio".into(), congestion.congestion_ratio().into());
+            congestion_info
+        });
+
         self.queue.write_batch(&mut wtxn, processing_batch, &ids)?;
 
         #[cfg(test)]
diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs
index 21233429c..8f3987bf6 100644
--- a/crates/index-scheduler/src/scheduler/process_batch.rs
+++ b/crates/index-scheduler/src/scheduler/process_batch.rs
@@ -5,7 +5,7 @@ use std::sync::atomic::Ordering;
 use meilisearch_types::batches::{BatchEnqueuedAt, BatchId};
 use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::progress::{Progress, VariableNameStep};
-use meilisearch_types::milli::{self};
+use meilisearch_types::milli::{self, ChannelCongestion};
 use meilisearch_types::tasks::{Details, IndexSwap, KindWithContent, Status, Task};
 use milli::update::Settings as MilliSettings;
 use roaring::RoaringBitmap;
@@ -35,7 +35,7 @@ impl IndexScheduler {
         batch: Batch,
         current_batch: &mut ProcessingBatch,
         progress: Progress,
-    ) -> Result<Vec<Task>> {
+    ) -> Result<(Vec<Task>, Option<ChannelCongestion>)> {
         #[cfg(test)]
         {
             self.maybe_fail(crate::test_utils::FailureLocation::InsideProcessBatch)?;
@@ -76,7 +76,7 @@ impl IndexScheduler {
 
                 canceled_tasks.push(task);
 
-                Ok(canceled_tasks)
+                Ok((canceled_tasks, None))
             }
             Batch::TaskDeletions(mut tasks) => {
                 // 1. Retrieve the tasks that matched the query at enqueue-time.
@@ -115,10 +115,14 @@ impl IndexScheduler {
                         _ => unreachable!(),
                     }
                 }
-                Ok(tasks)
+                Ok((tasks, None))
+            }
+            Batch::SnapshotCreation(tasks) => {
+                self.process_snapshot(progress, tasks).map(|tasks| (tasks, None))
+            }
+            Batch::Dump(task) => {
+                self.process_dump_creation(progress, task).map(|tasks| (tasks, None))
             }
-            Batch::SnapshotCreation(tasks) => self.process_snapshot(progress, tasks),
-            Batch::Dump(task) => self.process_dump_creation(progress, task),
             Batch::IndexOperation { op, must_create_index } => {
                 let index_uid = op.index_uid().to_string();
                 let index = if must_create_index {
@@ -135,7 +139,8 @@ impl IndexScheduler {
                     .set_currently_updating_index(Some((index_uid.clone(), index.clone())));
 
                 let mut index_wtxn = index.write_txn()?;
-                let tasks = self.apply_index_operation(&mut index_wtxn, &index, op, progress)?;
+                let (tasks, congestion) =
+                    self.apply_index_operation(&mut index_wtxn, &index, op, progress)?;
 
                 {
                     let span = tracing::trace_span!(target: "indexing::scheduler", "commit");
@@ -166,7 +171,7 @@ impl IndexScheduler {
                     ),
                 }
 
-                Ok(tasks)
+                Ok((tasks, congestion))
             }
             Batch::IndexCreation { index_uid, primary_key, task } => {
                 progress.update_progress(CreateIndexProgress::CreatingTheIndex);
@@ -234,7 +239,7 @@ impl IndexScheduler {
                     ),
                 }
 
-                Ok(vec![task])
+                Ok((vec![task], None))
             }
             Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => {
                 progress.update_progress(DeleteIndexProgress::DeletingTheIndex);
@@ -268,7 +273,7 @@ impl IndexScheduler {
                     };
                 }
 
-                Ok(tasks)
+                Ok((tasks, None))
             }
             Batch::IndexSwap { mut task } => {
                 progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap);
@@ -316,7 +321,7 @@ impl IndexScheduler {
                 }
                 wtxn.commit()?;
                 task.status = Status::Succeeded;
-                Ok(vec![task])
+                Ok((vec![task], None))
             }
             Batch::UpgradeDatabase { mut tasks } => {
                 let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else {
@@ -346,7 +351,7 @@ impl IndexScheduler {
                     task.error = None;
                 }
 
-                Ok(tasks)
+                Ok((tasks, None))
             }
         }
     }
diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs
index 630ab62e4..690fe2efd 100644
--- a/crates/index-scheduler/src/scheduler/process_index_operation.rs
+++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs
@@ -5,7 +5,7 @@ use meilisearch_types::milli::documents::PrimaryKey;
 use meilisearch_types::milli::progress::Progress;
 use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction};
 use meilisearch_types::milli::update::DocumentAdditionResult;
-use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder};
+use meilisearch_types::milli::{self, ChannelCongestion, Filter, ThreadPoolNoAbortBuilder};
 use meilisearch_types::settings::apply_settings_to_builder;
 use meilisearch_types::tasks::{Details, KindWithContent, Status, Task};
 use meilisearch_types::Index;
@@ -33,9 +33,8 @@ impl IndexScheduler {
         index: &'i Index,
         operation: IndexOperation,
         progress: Progress,
-    ) -> Result<Vec<Task>> {
+    ) -> Result<(Vec<Task>, Option<ChannelCongestion>)> {
         let indexer_alloc = Bump::new();
-
         let started_processing_at = std::time::Instant::now();
         let must_stop_processing = self.scheduler.must_stop_processing.clone();
@@ -60,7 +59,7 @@ impl IndexScheduler {
                     };
                 }
 
-                Ok(tasks)
+                Ok((tasks, None))
             }
             IndexOperation::DocumentOperation { index_uid, primary_key, operations, mut tasks } => {
                 progress.update_progress(DocumentOperationProgress::RetrievingConfig);
@@ -173,21 +172,24 @@ impl IndexScheduler {
                 }
                 progress.update_progress(DocumentOperationProgress::Indexing);
+                let mut congestion = None;
                 if tasks.iter().any(|res| res.error.is_none()) {
-                    indexer::index(
-                        index_wtxn,
-                        index,
-                        pool,
-                        indexer_config.grenad_parameters(),
-                        &db_fields_ids_map,
-                        new_fields_ids_map,
-                        primary_key,
-                        &document_changes,
-                        embedders,
-                        &|| must_stop_processing.get(),
-                        &progress,
-                    )
-                    .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
+                    congestion = Some(
+                        indexer::index(
+                            index_wtxn,
+                            index,
+                            pool,
+                            indexer_config.grenad_parameters(),
+                            &db_fields_ids_map,
+                            new_fields_ids_map,
+                            primary_key,
+                            &document_changes,
+                            embedders,
+                            &|| must_stop_processing.get(),
+                            &progress,
+                        )
+                        .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?,
+                    );
 
                     let addition = DocumentAdditionResult {
                         indexed_documents: candidates_count,
@@ -199,7 +201,7 @@ impl IndexScheduler {
                     tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
                 }
 
-                Ok(tasks)
+                Ok((tasks, congestion))
             }
             IndexOperation::DocumentEdition { index_uid, mut task } => {
                 progress.update_progress(DocumentEditionProgress::RetrievingConfig);
@@ -247,7 +249,7 @@ impl IndexScheduler {
                         edited_documents: Some(0),
                     });
 
-                    return Ok(vec![task]);
+                    return Ok((vec![task], None));
                 }
 
                 let rtxn = index.read_txn()?;
@@ -262,6 +264,7 @@ impl IndexScheduler {
 
                 let result_count = Ok((candidates.len(), candidates.len())) as Result<_>;
 
+                let mut congestion = None;
                 if task.error.is_none() {
                     let local_pool;
                     let indexer_config = self.index_mapper.indexer_config();
@@ -292,20 +295,22 @@ impl IndexScheduler {
                     let embedders = self.embedders(index_uid.clone(), embedders)?;
 
                     progress.update_progress(DocumentEditionProgress::Indexing);
-                    indexer::index(
-                        index_wtxn,
-                        index,
-                        pool,
-                        indexer_config.grenad_parameters(),
-                        &db_fields_ids_map,
-                        new_fields_ids_map,
-                        None, // cannot change primary key in DocumentEdition
-                        &document_changes,
-                        embedders,
-                        &|| must_stop_processing.get(),
-                        &progress,
-                    )
-                    .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
+                    congestion = Some(
+                        indexer::index(
+                            index_wtxn,
+                            index,
+                            pool,
+                            indexer_config.grenad_parameters(),
+                            &db_fields_ids_map,
+                            new_fields_ids_map,
+                            None, // cannot change primary key in DocumentEdition
+                            &document_changes,
+                            embedders,
+                            &|| must_stop_processing.get(),
+                            &progress,
+                        )
+                        .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
+                    );
 
                     let addition = DocumentAdditionResult {
                         indexed_documents: candidates_count,
@@ -341,7 +346,7 @@ impl IndexScheduler {
                     }
                 }
 
-                Ok(vec![task])
+                Ok((vec![task], congestion))
             }
             IndexOperation::DocumentDeletion { mut tasks, index_uid } => {
                 progress.update_progress(DocumentDeletionProgress::RetrievingConfig);
@@ -408,7 +413,7 @@ impl IndexScheduler {
                 }
 
                 if to_delete.is_empty() {
-                    return Ok(tasks);
+                    return Ok((tasks, None));
                 }
 
                 let rtxn = index.read_txn()?;
@@ -422,6 +427,7 @@ impl IndexScheduler {
                     PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map)
                         .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?;
 
+                let mut congestion = None;
                 if !tasks.iter().all(|res| res.error.is_some()) {
                     let local_pool;
                     let indexer_config = self.index_mapper.indexer_config();
@@ -447,20 +453,22 @@ impl IndexScheduler {
                     let embedders = self.embedders(index_uid.clone(), embedders)?;
 
                     progress.update_progress(DocumentDeletionProgress::Indexing);
-                    indexer::index(
-                        index_wtxn,
-                        index,
-                        pool,
-                        indexer_config.grenad_parameters(),
-                        &db_fields_ids_map,
-                        new_fields_ids_map,
-                        None, // document deletion never changes primary key
-                        &document_changes,
-                        embedders,
-                        &|| must_stop_processing.get(),
-                        &progress,
-                    )
-                    .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
+                    congestion = Some(
+                        indexer::index(
+                            index_wtxn,
+                            index,
+                            pool,
+                            indexer_config.grenad_parameters(),
+                            &db_fields_ids_map,
+                            new_fields_ids_map,
+                            None, // document deletion never changes primary key
+                            &document_changes,
+                            embedders,
+                            &|| must_stop_processing.get(),
+                            &progress,
+                        )
+                        .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
+                    );
 
                     let addition = DocumentAdditionResult {
                         indexed_documents: candidates_count,
@@ -472,7 +480,7 @@ impl IndexScheduler {
                     tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
                 }
 
-                Ok(tasks)
+                Ok((tasks, congestion))
             }
             IndexOperation::Settings { index_uid, settings, mut tasks } => {
                 progress.update_progress(SettingsProgress::RetrievingAndMergingTheSettings);
@@ -497,7 +505,7 @@ impl IndexScheduler {
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
 
-                Ok(tasks)
+                Ok((tasks, None))
             }
             IndexOperation::DocumentClearAndSetting {
                 index_uid,
@@ -505,7 +513,7 @@ impl IndexScheduler {
                 settings,
                 settings_tasks,
             } => {
-                let mut import_tasks = self.apply_index_operation(
+                let (mut import_tasks, _congestion) = self.apply_index_operation(
                     index_wtxn,
                     index,
                     IndexOperation::DocumentClear {
@@ -515,7 +523,7 @@ impl IndexScheduler {
                     progress.clone(),
                 )?;
 
-                let settings_tasks = self.apply_index_operation(
+                let (settings_tasks, _congestion) = self.apply_index_operation(
                     index_wtxn,
                     index,
                     IndexOperation::Settings { index_uid, settings, tasks: settings_tasks },
@@ -524,7 +532,7 @@ impl IndexScheduler {
                 let mut tasks = settings_tasks;
                 tasks.append(&mut import_tasks);
 
-                Ok(tasks)
+                Ok((tasks, None))
             }
         }
     }
diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs
index 05463b758..40309ce06 100644
--- a/crates/meilisearch-types/src/batches.rs
+++ b/crates/meilisearch-types/src/batches.rs
@@ -61,4 +61,6 @@ pub struct BatchStats {
     pub types: BTreeMap<Kind, u32>,
     pub index_uids: BTreeMap<String, u32>,
     pub call_trace: serde_json::Map<String, serde_json::Value>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub write_channel_congestion: Option<serde_json::Map<String, serde_json::Value>>,
 }
diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs
index ea88d2b78..bb1532c1a 100644
--- a/crates/milli/src/lib.rs
+++ b/crates/milli/src/lib.rs
@@ -73,6 +73,7 @@ pub use self::search::{
     FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
     Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
+pub use self::update::ChannelCongestion;
 
 pub type Result<T> = std::result::Result<T, Error>;
 
diff --git a/crates/milli/src/update/mod.rs b/crates/milli/src/update/mod.rs
index 68268db35..9a783ffd2 100644
--- a/crates/milli/src/update/mod.rs
+++ b/crates/milli/src/update/mod.rs
@@ -5,6 +5,7 @@ pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
 pub use self::index_documents::*;
 pub use self::indexer_config::IndexerConfig;
+pub use self::new::ChannelCongestion;
 pub use self::settings::{validate_embedding_settings, Setting, Settings};
 pub use self::update_step::UpdateIndexingStep;
 pub use self::word_prefix_docids::WordPrefixDocids;
diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs
index 53478f029..1606851cb 100644
--- a/crates/milli/src/update/new/indexer/extract.rs
+++ b/crates/milli/src/update/new/indexer/extract.rs
@@ -291,7 +291,7 @@ where
                 &indexing_context.must_stop_processing,
             )?;
         }
-        indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);
+        indexing_context.progress.update_progress(IndexingStep::TailWritingToDatabase);
         finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
 
         Result::Ok((facet_field_ids_delta, index_embeddings))
diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs
index 890191323..58c60d502 100644
--- a/crates/milli/src/update/new/indexer/mod.rs
+++ b/crates/milli/src/update/new/indexer/mod.rs
@@ -10,6 +10,7 @@ use hashbrown::HashMap;
 use heed::RwTxn;
 pub use partial_dump::PartialDump;
 pub use update_by_function::UpdateByFunction;
+pub use write::ChannelCongestion;
 use write::{build_vectors, update_index, write_to_db};
 
 use super::channel::*;
@@ -53,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>(
     embedders: EmbeddingConfigs,
     must_stop_processing: &'indexer MSP,
     progress: &'indexer Progress,
-) -> Result<()>
+) -> Result<ChannelCongestion>
 where
     DC: DocumentChanges<'pl>,
     MSP: Fn() -> bool + Sync,
@@ -130,7 +131,7 @@ where
     let mut field_distribution = index.field_distribution(wtxn)?;
     let mut document_ids = index.documents_ids(wtxn)?;
 
-    thread::scope(|s| -> Result<()> {
+    let congestion = thread::scope(|s| -> Result<ChannelCongestion> {
         let indexer_span = tracing::Span::current();
         let embedders = &embedders;
         let finished_extraction = &finished_extraction;
@@ -182,7 +183,8 @@ where
 
         let mut arroy_writers = arroy_writers?;
 
-        write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
+        let congestion =
+            write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
 
         indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
 
@@ -210,7 +212,7 @@ where
 
         indexing_context.progress.update_progress(IndexingStep::Finalizing);
 
-        Ok(()) as Result<_>
+        Ok(congestion) as Result<_>
     })?;
 
     // required to into_inner the new_fields_ids_map
@@ -227,5 +229,5 @@ where
         document_ids,
     )?;
 
-    Ok(())
+    Ok(congestion)
 }
diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs
index 707599ba3..c7e449243 100644
--- a/crates/milli/src/update/new/indexer/write.rs
+++ b/crates/milli/src/update/new/indexer/write.rs
@@ -14,13 +14,13 @@ use crate::update::settings::InnerIndexSettings;
 use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings};
 use crate::{Error, Index, InternalError, Result};
 
-pub(super) fn write_to_db(
+pub fn write_to_db(
     mut writer_receiver: WriterBbqueueReceiver<'_>,
     finished_extraction: &AtomicBool,
     index: &Index,
     wtxn: &mut RwTxn<'_>,
     arroy_writers: &HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
-) -> Result<()> {
+) -> Result<ChannelCongestion> {
     // Used by the ArroySetVector to copy the embedding into an
     // aligned memory area, required by arroy to accept a new vector.
     let mut aligned_embedding = Vec::new();
@@ -75,21 +75,34 @@ pub(super) fn write_to_db(
     write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?;
 
-    let direct_attempts = writer_receiver.sent_messages_attempts();
-    let blocking_attempts = writer_receiver.blocking_sent_messages_attempts();
-    let congestion_pct = (blocking_attempts as f64 / direct_attempts as f64) * 100.0;
-    tracing::debug!(
-        "Channel congestion metrics - \
-        Attempts: {direct_attempts}, \
-        Blocked attempts: {blocking_attempts} \
-        ({congestion_pct:.1}% congestion)"
-    );
+    Ok(ChannelCongestion {
+        attempts: writer_receiver.sent_messages_attempts(),
+        blocking_attempts: writer_receiver.blocking_sent_messages_attempts(),
+    })
+}
 
-    Ok(())
+/// Stats exposing the congestion of a channel.
+#[derive(Debug, Copy, Clone)]
+pub struct ChannelCongestion {
+    /// Number of attempts to send a message into the bbqueue buffer.
+    pub attempts: usize,
+    /// Number of blocking attempts that required a retry.
+    pub blocking_attempts: usize,
+}
+
+impl ChannelCongestion {
+    pub fn congestion_ratio(&self) -> f32 {
+        // Guard against a NaN ratio when no message was sent at all.
+        if self.attempts == 0 {
+            0.0
+        } else {
+            self.blocking_attempts as f32 / self.attempts as f32
+        }
+    }
 }
 
 #[tracing::instrument(level = "debug", skip_all, target = "indexing::vectors")]
-pub(super) fn build_vectors(
+pub fn build_vectors(
     index: &Index,
     wtxn: &mut RwTxn<'_>,
     index_embeddings: Vec<IndexEmbeddingConfig>,
@@ -113,7 +126,7 @@ where
     Ok(())
 }
 
-pub(super) fn update_index(
+pub fn update_index(
     index: &Index,
     wtxn: &mut RwTxn<'_>,
     new_fields_ids_map: FieldIdMapWithMetadata,
diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs
index b7e08a461..81ff93e54 100644
--- a/crates/milli/src/update/new/mod.rs
+++ b/crates/milli/src/update/new/mod.rs
@@ -1,4 +1,5 @@
 pub use document_change::{Deletion, DocumentChange, Insertion, Update};
+pub use indexer::ChannelCongestion;
 pub use merger::{
     merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta,
 };
diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs
index 9eb7d376d..38964d8ec 100644
--- a/crates/milli/src/update/new/steps.rs
+++ b/crates/milli/src/update/new/steps.rs
@@ -14,7 +14,7 @@ pub enum IndexingStep {
     ExtractingWordProximity,
     ExtractingEmbeddings,
     WritingGeoPoints,
-    WritingToDatabase,
+    TailWritingToDatabase,
     WaitingForExtractors,
     WritingEmbeddingsToDatabase,
     PostProcessingFacets,
@@ -32,7 +32,7 @@ impl Step for IndexingStep {
             IndexingStep::ExtractingWordProximity => "extracting word proximity",
             IndexingStep::ExtractingEmbeddings => "extracting embeddings",
             IndexingStep::WritingGeoPoints => "writing geo points",
-            IndexingStep::WritingToDatabase => "writing to database",
+            IndexingStep::TailWritingToDatabase => "tail writing to database",
             IndexingStep::WaitingForExtractors => "waiting for extractors",
             IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
             IndexingStep::PostProcessingFacets => "post-processing facets",
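Reviewer note, not part of the patch: a minimal sketch of how a caller of the
new API might interpret the ChannelCongestion value returned by
indexer::index(). The function name report_congestion and the 10% threshold
are invented for illustration; the type and congestion_ratio() come from the
patch above.

    use milli::update::ChannelCongestion;

    fn report_congestion(congestion: ChannelCongestion) {
        // Both counters come straight from the bbqueue writer receiver.
        let ChannelCongestion { attempts, blocking_attempts } = congestion;
        // congestion_ratio() is blocking_attempts / attempts (0.0 when idle).
        let ratio = congestion.congestion_ratio();
        // Arbitrary illustrative threshold, not a Meilisearch default.
        if ratio > 0.10 {
            eprintln!(
                "write channel congested: {blocking_attempts}/{attempts} sends blocked ({:.1}%)",
                ratio * 100.0
            );
        }
    }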