2022-10-20 16:25:34 +08:00
/*!
This crate defines the index scheduler, which is responsible for:

1. Keeping references to meilisearch's indexes and mapping them to their
   user-defined names.
2. Scheduling tasks given by the user and executing them, in batch if possible.

When an `IndexScheduler` is created, a new thread containing a reference to the
scheduler is created. This thread runs the scheduler's run loop, where the
scheduler waits to be woken up to process new tasks. It wakes up when:

1. it is launched for the first time
2. a new task is registered
3. a batch of tasks has been processed

It is only within this thread that the scheduler is allowed to process tasks.
On the other hand, the publicly accessible methods of the scheduler can be
called asynchronously from any thread. These methods can either query the
content of the scheduler or enqueue new tasks.
*/
2022-09-09 18:16:19 +08:00
mod autobatcher ;
2022-09-07 06:10:14 +08:00
mod batch ;
2022-09-06 22:43:59 +08:00
pub mod error ;
2023-06-23 04:56:44 +08:00
mod features ;
2022-09-14 18:35:33 +08:00
mod index_mapper ;
2022-10-10 18:57:17 +08:00
#[ cfg(test) ]
2022-10-25 16:23:14 +08:00
mod insta_snapshot ;
2023-01-12 00:30:50 +08:00
mod lru ;
2022-09-07 06:10:14 +08:00
mod utils ;
2023-10-25 16:49:50 +08:00
pub mod uuid_codec ;
2022-09-06 22:43:59 +08:00
2022-09-07 05:49:19 +08:00
/// Result type used by this crate: a [`std::result::Result`] whose error type is
/// fixed to this crate's [`Error`].
pub type Result < T > = std ::result ::Result < T , Error > ;
2022-09-06 22:43:59 +08:00
/// Identifier of a task, as used by the `uids` and `canceled_by` filters of [`Query`].
pub type TaskId = u32 ;
2023-05-25 18:58:13 +08:00
use std ::collections ::{ BTreeMap , HashMap } ;
2023-11-29 21:27:50 +08:00
use std ::io ::{ self , BufReader , Read } ;
2022-10-24 19:32:46 +08:00
use std ::ops ::{ Bound , RangeBounds } ;
2023-02-15 19:30:46 +08:00
use std ::path ::{ Path , PathBuf } ;
2024-02-26 17:43:04 +08:00
use std ::sync ::atomic ::Ordering ::{ self , Relaxed } ;
use std ::sync ::atomic ::{ AtomicBool , AtomicU32 } ;
2022-10-03 22:15:10 +08:00
use std ::sync ::{ Arc , RwLock } ;
2022-10-24 20:16:14 +08:00
use std ::time ::Duration ;
2022-10-03 22:15:10 +08:00
2022-10-25 16:59:06 +08:00
use dump ::{ KindDump , TaskDump , UpdateFile } ;
pub use error ::Error ;
2023-06-23 04:56:44 +08:00
pub use features ::RoFeatures ;
2022-10-16 07:39:01 +08:00
use file_store ::FileStore ;
2023-11-29 20:09:04 +08:00
use flate2 ::bufread ::GzEncoder ;
use flate2 ::Compression ;
2022-10-05 22:48:43 +08:00
use meilisearch_types ::error ::ResponseError ;
2023-06-23 04:56:44 +08:00
use meilisearch_types ::features ::{ InstanceTogglableFeatures , RuntimeTogglableFeatures } ;
2023-11-23 01:21:19 +08:00
use meilisearch_types ::heed ::byteorder ::BE ;
use meilisearch_types ::heed ::types ::{ SerdeBincode , SerdeJson , Str , I128 } ;
use meilisearch_types ::heed ::{ self , Database , Env , PutFlags , RoTxn , RwTxn } ;
2022-10-25 16:59:06 +08:00
use meilisearch_types ::milli ::documents ::DocumentsBatchBuilder ;
2022-10-21 00:00:07 +08:00
use meilisearch_types ::milli ::update ::IndexerConfig ;
2023-12-13 22:38:44 +08:00
use meilisearch_types ::milli ::vector ::{ Embedder , EmbedderOptions , EmbeddingConfigs } ;
2023-02-24 02:31:57 +08:00
use meilisearch_types ::milli ::{ self , CboRoaringBitmapCodec , Index , RoaringBitmapCodec , BEU32 } ;
2023-11-28 22:08:13 +08:00
use meilisearch_types ::task_view ::TaskView ;
2022-10-25 16:59:06 +08:00
use meilisearch_types ::tasks ::{ Kind , KindWithContent , Status , Task } ;
2024-02-26 17:43:04 +08:00
use rayon ::current_num_threads ;
2024-02-22 21:56:22 +08:00
use rayon ::prelude ::{ IntoParallelIterator , ParallelIterator } ;
2022-10-03 22:15:10 +08:00
use roaring ::RoaringBitmap ;
use synchronoise ::SignalEvent ;
2023-04-26 19:55:02 +08:00
use time ::format_description ::well_known ::Rfc3339 ;
2022-10-03 22:15:10 +08:00
use time ::OffsetDateTime ;
2022-10-25 22:10:14 +08:00
use utils ::{ filter_out_references_to_newer_tasks , keep_tasks_within_datetimes , map_bound } ;
2022-10-03 22:15:10 +08:00
use uuid ::Uuid ;
use crate ::index_mapper ::IndexMapper ;
2022-11-29 17:38:27 +08:00
use crate ::utils ::{ check_index_swap_validity , clamp_to_page_size } ;
2022-10-03 22:15:10 +08:00
2023-11-23 01:21:19 +08:00
/// Codec for `i128` keys built from heed's `I128` and the `BE` byte order.
///
/// It is the key type of the `enqueued_at`, `started_at` and `finished_at` databases.
// NOTE(review): `BE` is presumably big-endian, which would keep the keys ordered by value — confirm.
pub ( crate ) type BEI128 = I128 < BE > ;
2022-10-19 18:59:12 +08:00
2022-10-20 16:25:34 +08:00
/// Defines a subset of tasks to be retrieved from the [`IndexScheduler`].
///
/// An empty/default query (where each field is set to `None`) matches all tasks.
/// Each non-null field restricts the set of tasks further.
2022-10-19 22:07:04 +08:00
#[ derive(Default, Debug, Clone, PartialEq, Eq) ]
2022-10-03 22:15:10 +08:00
pub struct Query {
2022-10-20 16:25:34 +08:00
/// The maximum number of tasks to be matched
2022-10-13 18:48:23 +08:00
pub limit : Option < u32 > ,
2022-10-20 16:25:34 +08:00
/// The minimum [task id](`meilisearch_types::tasks::Task::uid`) to be matched
2022-10-03 22:15:10 +08:00
pub from : Option < u32 > ,
2022-10-20 16:25:34 +08:00
/// The allowed [statuses](`meilisearch_types::tasks::Task::status`) of the matched tasls
2022-11-28 23:27:41 +08:00
pub statuses : Option < Vec < Status > > ,
2022-10-20 16:25:34 +08:00
/// The allowed [kinds](meilisearch_types::tasks::Kind) of the matched tasks.
///
/// The kind of a task is given by:
/// ```
/// # use meilisearch_types::tasks::{Task, Kind};
/// # fn doc_func(task: Task) -> Kind {
/// task.kind.as_kind()
/// # }
/// ```
2022-11-28 23:27:41 +08:00
pub types : Option < Vec < Kind > > ,
2022-10-20 16:25:34 +08:00
/// The allowed [index ids](meilisearch_types::tasks::Task::index_uid) of the matched tasks
2022-11-28 23:27:41 +08:00
pub index_uids : Option < Vec < String > > ,
2022-10-20 16:25:34 +08:00
/// The [task ids](`meilisearch_types::tasks::Task::uid`) to be matched
2022-11-28 23:27:41 +08:00
pub uids : Option < Vec < TaskId > > ,
/// The [task ids](`meilisearch_types::tasks::Task::uid`) of the [`TaskCancelation`](meilisearch_types::tasks::Task::Kind::TaskCancelation) tasks
/// that canceled the matched tasks.
pub canceled_by : Option < Vec < TaskId > > ,
2022-10-20 16:25:34 +08:00
/// Exclusive upper bound of the matched tasks' [`enqueued_at`](meilisearch_types::tasks::Task::enqueued_at) field.
2022-10-19 18:59:12 +08:00
pub before_enqueued_at : Option < OffsetDateTime > ,
2022-10-20 16:25:34 +08:00
/// Exclusive lower bound of the matched tasks' [`enqueued_at`](meilisearch_types::tasks::Task::enqueued_at) field.
2022-10-19 18:59:12 +08:00
pub after_enqueued_at : Option < OffsetDateTime > ,
2022-10-20 16:25:34 +08:00
/// Exclusive upper bound of the matched tasks' [`started_at`](meilisearch_types::tasks::Task::started_at) field.
2022-10-19 18:59:12 +08:00
pub before_started_at : Option < OffsetDateTime > ,
2022-10-20 16:25:34 +08:00
/// Exclusive lower bound of the matched tasks' [`started_at`](meilisearch_types::tasks::Task::started_at) field.
2022-10-19 18:59:12 +08:00
pub after_started_at : Option < OffsetDateTime > ,
2022-10-20 16:25:34 +08:00
/// Exclusive upper bound of the matched tasks' [`finished_at`](meilisearch_types::tasks::Task::finished_at) field.
2022-10-19 18:59:12 +08:00
pub before_finished_at : Option < OffsetDateTime > ,
2022-10-20 16:25:34 +08:00
/// Exclusive lower bound of the matched tasks' [`finished_at`](meilisearch_types::tasks::Task::finished_at) field.
2022-10-19 18:59:12 +08:00
pub after_finished_at : Option < OffsetDateTime > ,
2022-10-03 22:15:10 +08:00
}
impl Query {
2022-10-27 22:23:50 +08:00
/// Return `true` if every field of the query is set to `None`, such that the query
2022-10-20 16:25:34 +08:00
/// matches all tasks.
2022-10-15 17:17:06 +08:00
pub fn is_empty ( & self ) -> bool {
matches! (
self ,
Query {
limit : None ,
from : None ,
2022-11-28 23:27:41 +08:00
statuses : None ,
types : None ,
index_uids : None ,
uids : None ,
canceled_by : None ,
2022-10-19 18:59:12 +08:00
before_enqueued_at : None ,
after_enqueued_at : None ,
before_started_at : None ,
after_started_at : None ,
before_finished_at : None ,
after_finished_at : None ,
2022-10-15 17:17:06 +08:00
}
)
}
2022-10-03 22:15:10 +08:00
2022-10-20 16:25:34 +08:00
/// Add an [index id](meilisearch_types::tasks::Task::index_uid) to the list of permitted indexes.
2022-10-03 22:15:10 +08:00
pub fn with_index ( self , index_uid : String ) -> Self {
2022-11-28 23:27:41 +08:00
let mut index_vec = self . index_uids . unwrap_or_default ( ) ;
2022-10-03 22:15:10 +08:00
index_vec . push ( index_uid ) ;
2022-11-28 23:27:41 +08:00
Self { index_uids : Some ( index_vec ) , .. self }
2022-10-03 22:15:10 +08:00
}
2023-07-05 16:58:10 +08:00
// Removes the `from` and `limit` restrictions from the query.
// Useful to get the total number of tasks matching a filter.
pub fn without_limits ( self ) -> Self {
Query { limit : None , from : None , .. self }
}
2022-10-03 22:15:10 +08:00
}
2022-10-17 19:54:35 +08:00
/// The tasks that are currently being processed, together with the moment
/// their processing started.
#[derive(Debug, Clone)]
struct ProcessingTasks {
    /// Date and time at which the current processing started.
    started_at: OffsetDateTime,
    /// Ids of the tasks that are running right now.
    processing: RoaringBitmap,
}
impl ProcessingTasks {
2022-10-19 17:26:55 +08:00
/// Creates an empty `ProcessingAt` struct.
fn new ( ) -> ProcessingTasks {
2022-10-21 00:00:07 +08:00
ProcessingTasks { started_at : OffsetDateTime ::now_utc ( ) , processing : RoaringBitmap ::new ( ) }
2022-10-19 17:26:55 +08:00
}
2022-10-19 17:22:59 +08:00
/// Stores the currently processing tasks, and the date time at which it started.
2022-10-17 19:54:35 +08:00
fn start_processing_at ( & mut self , started_at : OffsetDateTime , processing : RoaringBitmap ) {
self . started_at = started_at ;
self . processing = processing ;
}
2022-11-28 23:27:41 +08:00
/// Set the processing tasks to an empty list
2023-11-27 22:11:22 +08:00
fn stop_processing ( & mut self ) -> RoaringBitmap {
std ::mem ::take ( & mut self . processing )
2022-10-17 19:54:35 +08:00
}
2022-11-28 23:27:41 +08:00
/// Returns `true` if there, at least, is one task that is currently processing that we must stop.
2022-10-19 17:22:59 +08:00
fn must_cancel_processing_tasks ( & self , canceled_tasks : & RoaringBitmap ) -> bool {
! self . processing . is_disjoint ( canceled_tasks )
}
}
/// A boolean flag used to ask for the currently processing tasks to be stopped.
///
/// The flag lives behind an `Arc`, so every clone observes and updates the
/// same value. It starts out as `false`.
#[derive(Default, Clone, Debug)]
struct MustStopProcessing(Arc<AtomicBool>);

impl MustStopProcessing {
    /// Reads the current value of the flag.
    fn get(&self) -> bool {
        let Self(flag) = self;
        flag.load(Ordering::Relaxed)
    }

    /// Raises the flag.
    fn must_stop(&self) {
        let Self(flag) = self;
        flag.store(true, Ordering::Relaxed);
    }

    /// Lowers the flag.
    fn reset(&self) {
        let Self(flag) = self;
        flag.store(false, Ordering::Relaxed);
    }
}
2022-10-03 22:15:10 +08:00
/// Names under which the `IndexScheduler` creates its databases.
mod db_name {
    /// Name of the database behind `IndexScheduler::all_tasks`.
    pub const ALL_TASKS: &str = " all-tasks ";
    /// Name of the database behind `IndexScheduler::status`.
    pub const STATUS: &str = " status ";
    /// Name of the database behind `IndexScheduler::kind`.
    pub const KIND: &str = " kind ";
    /// Name of the database behind `IndexScheduler::index_tasks`.
    pub const INDEX_TASKS: &str = " index-tasks ";
    /// Name of the database behind `IndexScheduler::canceled_by`.
    pub const CANCELED_BY: &str = " canceled_by ";
    /// Name of the database behind `IndexScheduler::enqueued_at`.
    pub const ENQUEUED_AT: &str = " enqueued-at ";
    /// Name of the database behind `IndexScheduler::started_at`.
    pub const STARTED_AT: &str = " started-at ";
    /// Name of the database behind `IndexScheduler::finished_at`.
    pub const FINISHED_AT: &str = " finished-at ";
}
2022-10-26 17:41:59 +08:00
/// Identifies a location in the scheduler at which the test suite can set a
/// breakpoint. Only compiled in test builds.
#[cfg(test)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Breakpoint {
    /// Only encountered while the scheduler is being created in the test suite.
    Init,
    Start,
    BatchCreated,
    BeforeProcessing,
    AfterProcessing,
    AbortedIndexation,
    ProcessBatchSucceeded,
    ProcessBatchFailed,
    InsideProcessBatch,
}
#[ derive(Debug) ]
pub struct IndexSchedulerOptions {
/// The path to the version file of Meilisearch.
2022-10-26 17:47:49 +08:00
pub version_file_path : PathBuf ,
2022-10-26 17:41:59 +08:00
/// The path to the folder containing the auth LMDB env.
2022-10-26 17:47:49 +08:00
pub auth_path : PathBuf ,
2022-10-26 17:41:59 +08:00
/// The path to the folder containing the task databases.
2022-10-26 17:47:49 +08:00
pub tasks_path : PathBuf ,
2022-10-26 17:41:59 +08:00
/// The path to the file store containing the files associated to the tasks.
2022-10-26 17:47:49 +08:00
pub update_file_path : PathBuf ,
2022-10-26 17:41:59 +08:00
/// The path to the folder containing meilisearch's indexes.
2022-10-26 17:47:49 +08:00
pub indexes_path : PathBuf ,
2022-10-26 17:41:59 +08:00
/// The path to the folder containing the snapshots.
2022-10-26 17:47:49 +08:00
pub snapshots_path : PathBuf ,
2022-10-26 17:41:59 +08:00
/// The path to the folder containing the dumps.
2022-10-26 17:47:49 +08:00
pub dumps_path : PathBuf ,
2023-12-19 19:18:45 +08:00
/// The URL on which we must send the tasks statuses
2023-11-27 22:11:22 +08:00
pub webhook_url : Option < String > ,
2023-12-19 19:18:45 +08:00
/// The value we will send into the Authorization HTTP header on the webhook URL
pub webhook_authorization_header : Option < String > ,
2022-12-26 18:41:31 +08:00
/// The maximum size, in bytes, of the task index.
2022-10-26 17:47:49 +08:00
pub task_db_size : usize ,
2023-01-12 00:34:46 +08:00
/// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index.
pub index_base_map_size : usize ,
2023-05-15 21:08:28 +08:00
/// Whether we open a meilisearch index with the MDB_WRITEMAP option or not.
2023-05-15 17:23:58 +08:00
pub enable_mdb_writemap : bool ,
2023-01-12 00:34:46 +08:00
/// The size, in bytes, by which the map size of an index is increased when it resized due to being full.
pub index_growth_amount : usize ,
/// The number of indexes that can be concurrently opened in memory.
pub index_count : usize ,
2022-10-26 17:41:59 +08:00
/// Configuration used during indexing for each meilisearch index.
2022-10-26 17:47:49 +08:00
pub indexer_config : IndexerConfig ,
2022-10-26 17:41:59 +08:00
/// Set to `true` iff the index scheduler is allowed to automatically
/// batch tasks together, to process multiple tasks at once.
2022-10-26 17:47:49 +08:00
pub autobatching_enabled : bool ,
2024-02-20 19:16:50 +08:00
/// Set to `true` iff the index scheduler is allowed to automatically
/// delete the finished tasks when there are too many tasks.
pub cleanup_enabled : bool ,
2023-04-25 23:26:34 +08:00
/// The maximum number of tasks stored in the task queue before starting
/// to auto schedule task deletions.
pub max_number_of_tasks : usize ,
2023-12-11 23:08:39 +08:00
/// If the autobatcher is allowed to automatically batch tasks
/// it will only batch this defined number of tasks at once.
pub max_number_of_batched_tasks : usize ,
2023-06-23 04:56:44 +08:00
/// The experimental features enabled for this instance.
pub instance_features : InstanceTogglableFeatures ,
2022-10-26 17:41:59 +08:00
}
2022-10-20 16:25:34 +08:00
/// Structure which holds meilisearch's indexes and schedules the tasks
/// to be performed on them.
2022-10-03 22:15:10 +08:00
pub struct IndexScheduler {
/// The LMDB environment which the DBs are associated with.
pub ( crate ) env : Env ,
2022-10-19 17:22:59 +08:00
/// A boolean that can be set to true to stop the currently processing tasks.
pub ( crate ) must_stop_processing : MustStopProcessing ,
2022-10-20 16:25:34 +08:00
/// The list of tasks currently processing
2022-10-17 19:54:35 +08:00
pub ( crate ) processing_tasks : Arc < RwLock < ProcessingTasks > > ,
2022-10-20 16:25:34 +08:00
/// The list of files referenced by the tasks
2022-10-17 19:54:35 +08:00
pub ( crate ) file_store : FileStore ,
2022-10-03 22:15:10 +08:00
// The main database, it contains all the tasks accessible by their Id.
2023-11-23 01:21:19 +08:00
pub ( crate ) all_tasks : Database < BEU32 , SerdeJson < Task > > ,
2022-10-03 22:15:10 +08:00
/// All the tasks ids grouped by their status.
2022-10-17 23:19:17 +08:00
// TODO we should not be able to serialize a `Status::Processing` in this database.
2022-10-03 22:15:10 +08:00
pub ( crate ) status : Database < SerdeBincode < Status > , RoaringBitmapCodec > ,
/// All the tasks ids grouped by their kind.
pub ( crate ) kind : Database < SerdeBincode < Kind > , RoaringBitmapCodec > ,
/// Store the tasks associated to an index.
pub ( crate ) index_tasks : Database < Str , RoaringBitmapCodec > ,
2022-11-28 23:27:41 +08:00
/// Store the tasks that were canceled by a task uid
2023-11-23 01:21:19 +08:00
pub ( crate ) canceled_by : Database < BEU32 , RoaringBitmapCodec > ,
2022-11-28 23:27:41 +08:00
2022-10-19 18:59:12 +08:00
/// Store the task ids of tasks which were enqueued at a specific date
2023-11-23 01:21:19 +08:00
pub ( crate ) enqueued_at : Database < BEI128 , CboRoaringBitmapCodec > ,
2022-10-19 18:59:12 +08:00
/// Store the task ids of finished tasks which started being processed at a specific date
2023-11-23 01:21:19 +08:00
pub ( crate ) started_at : Database < BEI128 , CboRoaringBitmapCodec > ,
2022-10-19 18:59:12 +08:00
/// Store the task ids of tasks which finished at a specific date
2023-11-23 01:21:19 +08:00
pub ( crate ) finished_at : Database < BEI128 , CboRoaringBitmapCodec > ,
2022-10-19 18:59:12 +08:00
2022-10-03 22:15:10 +08:00
/// In charge of creating, opening, storing and returning indexes.
pub ( crate ) index_mapper : IndexMapper ,
2023-06-23 04:56:44 +08:00
/// In charge of fetching and setting the status of experimental features.
features : features ::FeatureData ,
2022-10-03 22:15:10 +08:00
/// Get a signal when a batch needs to be processed.
pub ( crate ) wake_up : Arc < SignalEvent > ,
2022-10-17 19:54:35 +08:00
/// Whether auto-batching is enabled or not.
2022-10-10 23:00:56 +08:00
pub ( crate ) autobatching_enabled : bool ,
2024-02-20 19:16:50 +08:00
/// Whether we should automatically cleanup the task queue or not.
pub ( crate ) cleanup_enabled : bool ,
2023-04-25 23:26:34 +08:00
/// The max number of tasks allowed before the scheduler starts to delete
/// the finished tasks automatically.
pub ( crate ) max_number_of_tasks : usize ,
2023-12-11 23:08:39 +08:00
/// The maximum number of tasks that will be batched together.
pub ( crate ) max_number_of_batched_tasks : usize ,
2023-11-27 22:11:22 +08:00
/// The webhook url we should send tasks to after processing every batches.
pub ( crate ) webhook_url : Option < String > ,
2023-12-19 19:18:45 +08:00
/// The Authorization header to send to the webhook URL.
pub ( crate ) webhook_authorization_header : Option < String > ,
2023-11-27 22:11:22 +08:00
2022-10-13 21:02:59 +08:00
/// The path used to create the dumps.
pub ( crate ) dumps_path : PathBuf ,
2022-10-25 16:53:25 +08:00
/// The path used to create the snapshots.
pub ( crate ) snapshots_path : PathBuf ,
2022-10-25 20:35:10 +08:00
/// The path to the folder containing the auth LMDB env.
pub ( crate ) auth_path : PathBuf ,
2022-10-25 21:06:28 +08:00
/// The path to the version file of Meilisearch.
pub ( crate ) version_file_path : PathBuf ,
2023-12-12 19:08:36 +08:00
embedders : Arc < RwLock < HashMap < EmbedderOptions , Arc < Embedder > > > > ,
2023-11-15 22:46:37 +08:00
2022-10-03 22:15:10 +08:00
// ================= test
2022-10-25 15:48:51 +08:00
// The next entry is dedicated to the tests.
/// Provide a way to set a breakpoint in multiple part of the scheduler.
///
/// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation.
2022-10-03 22:15:10 +08:00
#[ cfg(test) ]
2022-10-20 23:11:44 +08:00
test_breakpoint_sdr : crossbeam ::channel ::Sender < ( Breakpoint , bool ) > ,
/// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler.
///
/// The first field is the iteration index and the second field identifies a location in the code.
2022-10-26 23:31:23 +08:00
#[ cfg(test) ]
2022-10-20 23:11:44 +08:00
planned_failures : Vec < ( usize , tests ::FailureLocation ) > ,
/// A counter that is incremented before every call to [`tick`](IndexScheduler::tick)
2022-10-26 23:31:23 +08:00
#[ cfg(test) ]
2022-10-20 23:11:44 +08:00
run_loop_iteration : Arc < RwLock < usize > > ,
}
2022-10-26 17:41:59 +08:00
2022-10-20 23:11:44 +08:00
impl IndexScheduler {
2022-10-25 16:53:25 +08:00
fn private_clone ( & self ) -> IndexScheduler {
IndexScheduler {
2022-10-20 23:11:44 +08:00
env : self . env . clone ( ) ,
must_stop_processing : self . must_stop_processing . clone ( ) ,
processing_tasks : self . processing_tasks . clone ( ) ,
file_store : self . file_store . clone ( ) ,
2022-10-25 16:53:29 +08:00
all_tasks : self . all_tasks ,
status : self . status ,
kind : self . kind ,
index_tasks : self . index_tasks ,
2022-11-28 23:27:41 +08:00
canceled_by : self . canceled_by ,
2022-10-25 16:53:29 +08:00
enqueued_at : self . enqueued_at ,
started_at : self . started_at ,
finished_at : self . finished_at ,
2022-10-20 23:11:44 +08:00
index_mapper : self . index_mapper . clone ( ) ,
wake_up : self . wake_up . clone ( ) ,
2022-10-25 16:53:29 +08:00
autobatching_enabled : self . autobatching_enabled ,
2024-02-20 19:16:50 +08:00
cleanup_enabled : self . cleanup_enabled ,
2023-04-25 23:26:34 +08:00
max_number_of_tasks : self . max_number_of_tasks ,
2023-12-11 23:08:39 +08:00
max_number_of_batched_tasks : self . max_number_of_batched_tasks ,
2022-10-25 16:53:25 +08:00
snapshots_path : self . snapshots_path . clone ( ) ,
2022-10-20 23:11:44 +08:00
dumps_path : self . dumps_path . clone ( ) ,
2022-10-25 20:35:10 +08:00
auth_path : self . auth_path . clone ( ) ,
2022-10-25 21:06:28 +08:00
version_file_path : self . version_file_path . clone ( ) ,
2023-11-27 22:11:22 +08:00
webhook_url : self . webhook_url . clone ( ) ,
2023-12-19 19:18:45 +08:00
webhook_authorization_header : self . webhook_authorization_header . clone ( ) ,
2023-11-15 22:46:37 +08:00
embedders : self . embedders . clone ( ) ,
2022-10-20 23:11:44 +08:00
#[ cfg(test) ]
test_breakpoint_sdr : self . test_breakpoint_sdr . clone ( ) ,
#[ cfg(test) ]
planned_failures : self . planned_failures . clone ( ) ,
#[ cfg(test) ]
run_loop_iteration : self . run_loop_iteration . clone ( ) ,
2023-06-23 04:56:44 +08:00
features : self . features . clone ( ) ,
2022-10-20 23:11:44 +08:00
}
}
2022-10-03 22:15:10 +08:00
}
impl IndexScheduler {
2022-10-20 16:25:34 +08:00
/// Create an index scheduler and start its run loop.
2022-10-03 22:15:10 +08:00
pub fn new (
2022-10-26 17:41:59 +08:00
options : IndexSchedulerOptions ,
2022-10-20 23:11:44 +08:00
#[ cfg(test) ] test_breakpoint_sdr : crossbeam ::channel ::Sender < ( Breakpoint , bool ) > ,
#[ cfg(test) ] planned_failures : Vec < ( usize , tests ::FailureLocation ) > ,
2022-10-03 22:15:10 +08:00
) -> Result < Self > {
2022-10-26 17:41:59 +08:00
std ::fs ::create_dir_all ( & options . tasks_path ) ? ;
std ::fs ::create_dir_all ( & options . update_file_path ) ? ;
std ::fs ::create_dir_all ( & options . indexes_path ) ? ;
std ::fs ::create_dir_all ( & options . dumps_path ) ? ;
2022-10-03 22:15:10 +08:00
2023-05-15 17:23:58 +08:00
if cfg! ( windows ) & & options . enable_mdb_writemap {
2023-05-15 21:08:28 +08:00
// programmer error if this happens: in normal use passing the option on Windows is an error in main
2023-05-15 17:23:58 +08:00
panic! ( " Windows doesn't support the MDB_WRITEMAP LMDB option " ) ;
}
2023-02-15 19:30:46 +08:00
let task_db_size = clamp_to_page_size ( options . task_db_size ) ;
2023-02-15 19:31:14 +08:00
let budget = if options . indexer_config . skip_index_budget {
IndexBudget {
2023-02-15 19:30:46 +08:00
map_size : options . index_base_map_size ,
index_count : options . index_count ,
task_db_size ,
2023-02-15 19:31:14 +08:00
}
} else {
Self ::index_budget (
& options . tasks_path ,
options . index_base_map_size ,
task_db_size ,
options . index_count ,
)
} ;
2023-02-15 19:30:46 +08:00
2024-05-16 22:10:55 +08:00
let env = unsafe {
heed ::EnvOpenOptions ::new ( )
. max_dbs ( 11 )
. map_size ( budget . task_db_size )
. open ( options . tasks_path )
} ? ;
2023-06-23 04:56:44 +08:00
let features = features ::FeatureData ::new ( & env , options . instance_features ) ? ;
2022-10-26 17:41:59 +08:00
let file_store = FileStore ::new ( & options . update_file_path ) ? ;
2022-10-03 22:15:10 +08:00
2023-05-15 16:15:33 +08:00
let mut wtxn = env . write_txn ( ) ? ;
let all_tasks = env . create_database ( & mut wtxn , Some ( db_name ::ALL_TASKS ) ) ? ;
let status = env . create_database ( & mut wtxn , Some ( db_name ::STATUS ) ) ? ;
let kind = env . create_database ( & mut wtxn , Some ( db_name ::KIND ) ) ? ;
let index_tasks = env . create_database ( & mut wtxn , Some ( db_name ::INDEX_TASKS ) ) ? ;
let canceled_by = env . create_database ( & mut wtxn , Some ( db_name ::CANCELED_BY ) ) ? ;
let enqueued_at = env . create_database ( & mut wtxn , Some ( db_name ::ENQUEUED_AT ) ) ? ;
let started_at = env . create_database ( & mut wtxn , Some ( db_name ::STARTED_AT ) ) ? ;
let finished_at = env . create_database ( & mut wtxn , Some ( db_name ::FINISHED_AT ) ) ? ;
wtxn . commit ( ) ? ;
2022-10-03 22:15:10 +08:00
// allow unreachable_code to get rids of the warning in the case of a test build.
let this = Self {
2022-10-19 17:22:59 +08:00
must_stop_processing : MustStopProcessing ::default ( ) ,
2022-10-19 17:26:55 +08:00
processing_tasks : Arc ::new ( RwLock ::new ( ProcessingTasks ::new ( ) ) ) ,
2022-10-03 22:15:10 +08:00
file_store ,
2023-05-15 16:15:33 +08:00
all_tasks ,
status ,
kind ,
index_tasks ,
canceled_by ,
enqueued_at ,
started_at ,
finished_at ,
2022-10-26 17:41:59 +08:00
index_mapper : IndexMapper ::new (
& env ,
options . indexes_path ,
2023-02-15 19:30:46 +08:00
budget . map_size ,
2023-01-12 00:34:46 +08:00
options . index_growth_amount ,
2023-02-15 19:30:46 +08:00
budget . index_count ,
2023-05-15 17:23:58 +08:00
options . enable_mdb_writemap ,
2022-10-26 17:41:59 +08:00
options . indexer_config ,
) ? ,
2022-10-03 22:15:10 +08:00
env ,
// we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
wake_up : Arc ::new ( SignalEvent ::auto ( true ) ) ,
2022-10-26 17:41:59 +08:00
autobatching_enabled : options . autobatching_enabled ,
2024-02-20 19:16:50 +08:00
cleanup_enabled : options . cleanup_enabled ,
2023-04-25 23:26:34 +08:00
max_number_of_tasks : options . max_number_of_tasks ,
2023-12-11 23:08:39 +08:00
max_number_of_batched_tasks : options . max_number_of_batched_tasks ,
2022-10-26 17:41:59 +08:00
dumps_path : options . dumps_path ,
snapshots_path : options . snapshots_path ,
auth_path : options . auth_path ,
version_file_path : options . version_file_path ,
2023-11-27 22:11:22 +08:00
webhook_url : options . webhook_url ,
2023-12-19 19:18:45 +08:00
webhook_authorization_header : options . webhook_authorization_header ,
2023-11-15 22:46:37 +08:00
embedders : Default ::default ( ) ,
2022-10-03 22:15:10 +08:00
#[ cfg(test) ]
test_breakpoint_sdr ,
2022-10-20 23:11:44 +08:00
#[ cfg(test) ]
planned_failures ,
#[ cfg(test) ]
run_loop_iteration : Arc ::new ( RwLock ::new ( 0 ) ) ,
2023-06-23 04:56:44 +08:00
features ,
2022-10-03 22:15:10 +08:00
} ;
this . run ( ) ;
Ok ( this )
}
2023-04-06 19:38:47 +08:00
/// Returns `Ok(())` when the index scheduler manages to access one of its databases.
///
/// A read transaction is opened and the first entry of the `all_tasks`
/// database is looked up; any failure along the way is returned.
pub fn health(&self) -> Result<()> {
    let txn = self.env.read_txn()?;
    // The entry itself is irrelevant, only the success of the lookup matters.
    Ok(self.all_tasks.first(&txn).map(drop)?)
}
2023-02-15 19:30:46 +08:00
fn index_budget (
tasks_path : & Path ,
base_map_size : usize ,
mut task_db_size : usize ,
max_index_count : usize ,
) -> IndexBudget {
2023-02-28 00:13:07 +08:00
#[ cfg(windows) ]
const DEFAULT_BUDGET : usize = 6 * 1024 * 1024 * 1024 * 1024 ; // 6 TiB, 1 index
#[ cfg(not(windows)) ]
const DEFAULT_BUDGET : usize = 80 * 1024 * 1024 * 1024 * 1024 ; // 80 TiB, 18 indexes
let budget = if Self ::is_good_heed ( tasks_path , DEFAULT_BUDGET ) {
DEFAULT_BUDGET
} else {
2024-02-07 22:51:38 +08:00
tracing ::debug! ( " determining budget with dichotomic search " ) ;
2023-02-28 00:13:07 +08:00
utils ::dichotomic_search ( DEFAULT_BUDGET / 2 , | map_size | {
Self ::is_good_heed ( tasks_path , map_size )
} )
} ;
2023-02-15 19:30:46 +08:00
2024-02-07 22:51:38 +08:00
tracing ::debug! ( " memmap budget: {budget}B " ) ;
2023-02-15 19:30:46 +08:00
let mut budget = budget / 2 ;
if task_db_size > ( budget / 2 ) {
task_db_size = clamp_to_page_size ( budget * 2 / 5 ) ;
2024-02-07 22:51:38 +08:00
tracing ::debug! (
2023-02-15 19:30:46 +08:00
" Decreasing max size of task DB to {task_db_size}B due to constrained memory space "
) ;
}
budget - = task_db_size ;
// won't be mutated again
let budget = budget ;
let task_db_size = task_db_size ;
2024-02-07 22:51:38 +08:00
tracing ::debug! ( " index budget: {budget}B " ) ;
2023-02-15 19:30:46 +08:00
let mut index_count = budget / base_map_size ;
if index_count < 2 {
// take a bit less than half than the budget to make sure we can always afford to open an index
let map_size = ( budget * 2 ) / 5 ;
// single index of max budget
2024-02-07 22:51:38 +08:00
tracing ::debug! ( " 1 index of {map_size}B can be opened simultaneously. " ) ;
2023-02-15 19:30:46 +08:00
return IndexBudget { map_size , index_count : 1 , task_db_size } ;
}
// give us some space for an additional index when the cache is already full
// decrement is OK because index_count >= 2.
index_count - = 1 ;
if index_count > max_index_count {
index_count = max_index_count ;
}
2024-02-07 22:51:38 +08:00
tracing ::debug! ( " Up to {index_count} indexes of {base_map_size}B opened simultaneously. " ) ;
2023-02-15 19:30:46 +08:00
IndexBudget { map_size : base_map_size , index_count , task_db_size }
}
fn is_good_heed ( tasks_path : & Path , map_size : usize ) -> bool {
2024-05-16 22:10:55 +08:00
if let Ok ( env ) = unsafe {
2023-02-15 19:30:46 +08:00
heed ::EnvOpenOptions ::new ( ) . map_size ( clamp_to_page_size ( map_size ) ) . open ( tasks_path )
2024-05-16 22:10:55 +08:00
} {
2023-02-15 19:30:46 +08:00
env . prepare_for_closing ( ) . wait ( ) ;
true
} else {
// We're treating all errors equally here, not only allocation errors.
// This means there's a possiblity for the budget to lower due to errors different from allocation errors.
// For persistent errors, this is OK as long as the task db is then reopened normally without ignoring the error this time.
// For transient errors, this could lead to an instance with too low a budget.
// However transient errors are: 1) less likely than persistent errors 2) likely to cause other issues down the line anyway.
false
}
}
2022-10-27 17:17:50 +08:00
/// Opens a read transaction on the LMDB environment of the scheduler.
pub fn read_txn(&self) -> Result<RoTxn> {
    Ok(self.env.read_txn()?)
}
2022-10-20 16:25:34 +08:00
/// Start the run loop for the given index scheduler.
///
/// This function will execute in a different thread and must be called
/// only once per index scheduler.
2022-10-03 22:15:10 +08:00
fn run ( & self ) {
2022-10-20 23:11:44 +08:00
let run = self . private_clone ( ) ;
2022-11-28 23:27:41 +08:00
std ::thread ::Builder ::new ( )
. name ( String ::from ( " scheduler " ) )
. spawn ( move | | {
#[ cfg(test) ]
run . breakpoint ( Breakpoint ::Init ) ;
2023-01-10 02:30:29 +08:00
run . wake_up . wait ( ) ;
2022-11-28 23:27:41 +08:00
2023-01-10 02:30:29 +08:00
loop {
2022-11-28 23:27:41 +08:00
match run . tick ( ) {
2023-01-10 02:30:29 +08:00
Ok ( TickOutcome ::TickAgain ( _ ) ) = > ( ) ,
Ok ( TickOutcome ::WaitForSignal ) = > run . wake_up . wait ( ) ,
2022-11-28 23:27:41 +08:00
Err ( e ) = > {
2024-02-07 22:51:38 +08:00
tracing ::error! ( " {e} " ) ;
2022-11-28 23:27:41 +08:00
// Wait one second when an irrecoverable error occurs.
2023-04-04 03:08:47 +08:00
if ! e . is_recoverable ( ) {
2022-11-28 23:27:41 +08:00
std ::thread ::sleep ( Duration ::from_secs ( 1 ) ) ;
}
}
2022-10-24 20:16:14 +08:00
}
}
2022-11-28 23:27:41 +08:00
} )
. unwrap ( ) ;
2022-10-03 22:15:10 +08:00
}
2022-10-16 07:39:01 +08:00
/// Gives access to the indexer configuration held by the index mapper.
pub fn indexer_config(&self) -> &IndexerConfig {
    &self.index_mapper.indexer_config
}
2023-05-26 00:30:30 +08:00
/// Return the real database size (i.e.: The size **with** the free pages)
2023-01-24 23:17:23 +08:00
pub fn size ( & self ) -> Result < u64 > {
Ok ( self . env . real_disk_size ( ) ? )
}
2023-05-26 00:30:30 +08:00
/// Returns the used size of the database, i.e. the size **without** the free pages.
pub fn used_size(&self) -> Result<u64> {
    self.env.non_free_pages_size().map_err(Into::into)
}
2022-10-20 16:25:34 +08:00
/// Return the index corresponding to the name.
///
/// * If the index wasn't opened before, the index will be opened.
/// * If the index doesn't exist on disk, the `IndexNotFoundError` is thrown.
2023-02-20 23:42:54 +08:00
///
/// ### Note
///
/// As an `Index` requires a large swath of the virtual memory address space, correct usage of an `Index` does not
/// keep its handle for too long.
///
/// Some configurations also can't reasonably open multiple indexes at once.
/// If you need to fetch information from or perform an action on all indexes,
/// see the `try_for_each_index` function.
2022-10-03 22:15:10 +08:00
pub fn index ( & self , name : & str ) -> Result < Index > {
let rtxn = self . env . read_txn ( ) ? ;
self . index_mapper . index ( & rtxn , name )
}
2023-02-20 23:42:54 +08:00
/// Return the name of all indexes without opening them.
2023-02-24 02:31:57 +08:00
pub fn index_names ( & self ) -> Result < Vec < String > > {
2023-02-20 23:42:54 +08:00
let rtxn = self . env . read_txn ( ) ? ;
self . index_mapper . index_names ( & rtxn )
}
/// Attempts `f` for each index known to the index scheduler and collects the results into `V`.
///
/// It is preferable to use this function rather than a loop that opens all indexes, as a way to avoid having all
/// indexes opened, which is unsupported in general.
///
/// Since `f` is allowed to return a result, and `Index` is cloneable, it is still possible to wrongly build e.g. a
/// vector of all the indexes, but this function makes it harder and so less likely to do accidentally.
///
/// If many indexes exist, this operation can take time to complete (in the order of seconds for a 1000 of indexes)
/// as it needs to open all the indexes.
pub fn try_for_each_index<U, V>(&self, f: impl FnMut(&str, &Index) -> Result<U>) -> Result<V>
where
    V: FromIterator<U>,
{
    // The whole iteration is delegated to the index mapper under a single read transaction.
    let rtxn = self.env.read_txn()?;
    let collected = self.index_mapper.try_for_each_index(&rtxn, f)?;
    Ok(collected)
}
2022-10-27 17:17:50 +08:00
/// Return the task ids matched by the given query from the index scheduler's point of view.
pub ( crate ) fn get_task_ids ( & self , rtxn : & RoTxn , query : & Query ) -> Result < RoaringBitmap > {
2022-11-28 23:27:41 +08:00
let ProcessingTasks {
started_at : started_at_processing , processing : processing_tasks , ..
} = self . processing_tasks . read ( ) . unwrap ( ) . clone ( ) ;
2022-10-03 22:15:10 +08:00
2022-10-27 19:00:30 +08:00
let mut tasks = self . all_task_ids ( rtxn ) ? ;
2022-10-03 22:15:10 +08:00
2022-10-26 18:56:01 +08:00
if let Some ( from ) = & query . from {
tasks . remove_range ( from . saturating_add ( 1 ) .. ) ;
}
2022-11-28 23:27:41 +08:00
if let Some ( status ) = & query . statuses {
2022-10-03 22:15:10 +08:00
let mut status_tasks = RoaringBitmap ::new ( ) ;
for status in status {
2022-10-24 19:32:46 +08:00
match status {
// special case for Processing tasks
Status ::Processing = > {
2022-10-25 15:48:51 +08:00
status_tasks | = & processing_tasks ;
2022-10-24 19:32:46 +08:00
}
2022-10-27 19:00:30 +08:00
status = > status_tasks | = & self . get_status ( rtxn , * status ) ? ,
2022-10-24 19:32:46 +08:00
} ;
}
2022-10-25 15:48:51 +08:00
if ! status . contains ( & Status ::Processing ) {
tasks - = & processing_tasks ;
2022-10-03 22:15:10 +08:00
}
tasks & = status_tasks ;
}
2022-11-28 23:27:41 +08:00
if let Some ( uids ) = & query . uids {
2022-10-24 19:32:46 +08:00
let uids = RoaringBitmap ::from_iter ( uids ) ;
tasks & = & uids ;
}
2022-11-28 23:27:41 +08:00
if let Some ( canceled_by ) = & query . canceled_by {
2023-01-18 22:25:27 +08:00
let mut all_canceled_tasks = RoaringBitmap ::new ( ) ;
2022-11-28 23:27:41 +08:00
for cancel_task_uid in canceled_by {
2023-11-23 19:20:44 +08:00
if let Some ( canceled_by_uid ) = self . canceled_by . get ( rtxn , cancel_task_uid ) ? {
2023-01-18 22:25:27 +08:00
all_canceled_tasks | = canceled_by_uid ;
2022-11-28 23:27:41 +08:00
}
}
2023-01-18 22:25:27 +08:00
// if the canceled_by has been specified but no task
// matches then we prefer matching zero than all tasks.
if all_canceled_tasks . is_empty ( ) {
return Ok ( RoaringBitmap ::new ( ) ) ;
} else {
tasks & = all_canceled_tasks ;
}
2022-11-28 23:27:41 +08:00
}
if let Some ( kind ) = & query . types {
2022-10-03 22:15:10 +08:00
let mut kind_tasks = RoaringBitmap ::new ( ) ;
for kind in kind {
2022-10-27 19:00:30 +08:00
kind_tasks | = self . get_kind ( rtxn , * kind ) ? ;
2022-10-03 22:15:10 +08:00
}
2022-10-24 19:32:46 +08:00
tasks & = & kind_tasks ;
2022-10-03 22:15:10 +08:00
}
2022-11-28 23:27:41 +08:00
if let Some ( index ) = & query . index_uids {
2022-10-03 22:15:10 +08:00
let mut index_tasks = RoaringBitmap ::new ( ) ;
for index in index {
2022-10-27 19:00:30 +08:00
index_tasks | = self . index_tasks ( rtxn , index ) ? ;
2022-10-03 22:15:10 +08:00
}
2022-10-24 19:32:46 +08:00
tasks & = & index_tasks ;
2022-10-03 22:15:10 +08:00
}
2022-10-17 22:30:18 +08:00
2022-10-24 19:32:46 +08:00
// For the started_at filter, we need to treat the part of the tasks that are processing from the part of the
// tasks that are not processing. The non-processing ones are filtered normally while the processing ones
// are entirely removed unless the in-memory startedAt variable falls within the date filter.
// Once we have filtered the two subsets, we put them back together and assign it back to `tasks`.
tasks = {
let ( mut filtered_non_processing_tasks , mut filtered_processing_tasks ) =
2022-10-25 15:48:51 +08:00
( & tasks - & processing_tasks , & tasks & & processing_tasks ) ;
2022-10-24 19:32:46 +08:00
// special case for Processing tasks
2022-10-25 15:48:51 +08:00
// A closure that clears the filtered_processing_tasks if their started_at date falls outside the given bounds
let mut clear_filtered_processing_tasks =
| start : Bound < OffsetDateTime > , end : Bound < OffsetDateTime > | {
let start = map_bound ( start , | b | b . unix_timestamp_nanos ( ) ) ;
let end = map_bound ( end , | b | b . unix_timestamp_nanos ( ) ) ;
let is_within_dates = RangeBounds ::contains (
& ( start , end ) ,
& started_at_processing . unix_timestamp_nanos ( ) ,
) ;
if ! is_within_dates {
filtered_processing_tasks . clear ( ) ;
2022-10-24 19:32:46 +08:00
}
} ;
2022-10-25 15:48:51 +08:00
match ( query . after_started_at , query . before_started_at ) {
( None , None ) = > ( ) ,
( None , Some ( before ) ) = > {
clear_filtered_processing_tasks ( Bound ::Unbounded , Bound ::Excluded ( before ) )
2022-10-24 19:32:46 +08:00
}
2022-10-25 15:48:51 +08:00
( Some ( after ) , None ) = > {
clear_filtered_processing_tasks ( Bound ::Excluded ( after ) , Bound ::Unbounded )
}
( Some ( after ) , Some ( before ) ) = > {
clear_filtered_processing_tasks ( Bound ::Excluded ( after ) , Bound ::Excluded ( before ) )
}
} ;
2022-10-24 19:32:46 +08:00
keep_tasks_within_datetimes (
2022-10-27 19:00:30 +08:00
rtxn ,
2022-10-24 19:32:46 +08:00
& mut filtered_non_processing_tasks ,
self . started_at ,
query . after_started_at ,
query . before_started_at ,
) ? ;
filtered_non_processing_tasks | filtered_processing_tasks
} ;
2022-10-19 18:59:12 +08:00
keep_tasks_within_datetimes (
2022-10-27 19:00:30 +08:00
rtxn ,
2022-10-19 18:59:12 +08:00
& mut tasks ,
self . enqueued_at ,
query . after_enqueued_at ,
query . before_enqueued_at ,
) ? ;
keep_tasks_within_datetimes (
2022-10-27 19:00:30 +08:00
rtxn ,
2022-10-19 18:59:12 +08:00
& mut tasks ,
self . finished_at ,
query . after_finished_at ,
query . before_finished_at ,
) ? ;
2022-10-26 18:56:01 +08:00
if let Some ( limit ) = query . limit {
tasks = tasks . into_iter ( ) . rev ( ) . take ( limit as usize ) . collect ( ) ;
}
2022-10-13 17:09:00 +08:00
Ok ( tasks )
}
2023-06-06 18:28:27 +08:00
/// The returned structure contains:
/// 1. The name of the property being observed can be `statuses`, `types`, or `indexes`.
/// 2. The name of the specific data related to the property can be `enqueued` for the `statuses`, `settingsUpdate` for the `types`, or the name of the index for the `indexes`, for example.
/// 3. The number of times the properties appeared.
2023-05-25 18:58:13 +08:00
pub fn get_stats ( & self ) -> Result < BTreeMap < String , BTreeMap < String , u64 > > > {
let rtxn = self . read_txn ( ) ? ;
let mut res = BTreeMap ::new ( ) ;
2023-08-07 17:12:08 +08:00
let processing_tasks = { self . processing_tasks . read ( ) . unwrap ( ) . processing . len ( ) } ;
2023-05-25 18:58:13 +08:00
res . insert (
" statuses " . to_string ( ) ,
enum_iterator ::all ::< Status > ( )
2023-08-07 17:12:08 +08:00
. map ( | s | {
let tasks = self . get_status ( & rtxn , s ) ? . len ( ) ;
match s {
Status ::Enqueued = > Ok ( ( s . to_string ( ) , tasks - processing_tasks ) ) ,
Status ::Processing = > Ok ( ( s . to_string ( ) , processing_tasks ) ) ,
s = > Ok ( ( s . to_string ( ) , tasks ) ) ,
}
} )
2023-05-25 18:58:13 +08:00
. collect ::< Result < BTreeMap < String , u64 > > > ( ) ? ,
) ;
res . insert (
" types " . to_string ( ) ,
enum_iterator ::all ::< Kind > ( )
. map ( | s | Ok ( ( s . to_string ( ) , self . get_kind ( & rtxn , s ) ? . len ( ) ) ) )
. collect ::< Result < BTreeMap < String , u64 > > > ( ) ? ,
) ;
res . insert (
" indexes " . to_string ( ) ,
self . index_tasks
. iter ( & rtxn ) ?
. map ( | res | Ok ( res . map ( | ( name , bitmap ) | ( name . to_string ( ) , bitmap . len ( ) ) ) ? ) )
. collect ::< Result < BTreeMap < String , u64 > > > ( ) ? ,
) ;
Ok ( res )
}
2023-06-23 07:24:25 +08:00
/// Return true if there is at least one task that is processing.
///
/// Only the in-memory list of processing tasks is consulted.
pub fn is_task_processing(&self) -> Result<bool> {
    let guard = self.processing_tasks.read().unwrap();
    let idle = guard.processing.is_empty();
    Ok(!idle)
}
2022-10-27 17:17:50 +08:00
/// Return true iff there is at least one task associated with this index
/// that is processing.
///
/// The answer is the non-emptiness of the intersection between the in-memory
/// processing tasks and the tasks recorded for `index`.
pub fn is_index_processing(&self, index: &str) -> Result<bool> {
    let rtxn = self.env.read_txn()?;
    // Clone the bitmap so the lock is not held while reading the database.
    let currently_processing = self.processing_tasks.read().unwrap().processing.clone();
    let tasks_of_index = self.index_tasks(&rtxn, index)?;
    Ok(currently_processing.intersection_len(&tasks_of_index) != 0)
}
2023-07-05 16:58:10 +08:00
/// Return the task ids matching the query along with the total number of tasks
/// by ignoring the from and limit parameters from the user's point of view.
2022-10-27 17:17:50 +08:00
///
/// There are two differences between an internal query and a query executed by
/// the user.
///
/// 1. IndexSwap tasks are not publicly associated with any index, but they are associated
/// with many indexes internally.
2022-10-27 22:23:50 +08:00
/// 2. The user may not have the rights to access the tasks (internally) associated with all indexes.
2022-10-27 17:17:50 +08:00
pub fn get_task_ids_from_authorized_indexes (
& self ,
rtxn : & RoTxn ,
query : & Query ,
2023-02-19 21:40:25 +08:00
filters : & meilisearch_auth ::AuthFilter ,
2023-07-05 16:58:10 +08:00
) -> Result < ( RoaringBitmap , u64 ) > {
// compute all tasks matching the filter by ignoring the limits, to find the number of tasks matching
// the filter.
// As this causes us to compute the filter twice it is slightly inefficient, but doing it this way spares
// us from modifying the underlying implementation, and the performance remains sufficient.
// Should this change, we would modify `get_task_ids` to directly return the number of matching tasks.
let total_tasks = self . get_task_ids ( rtxn , & query . clone ( ) . without_limits ( ) ) ? ;
2022-10-27 19:00:30 +08:00
let mut tasks = self . get_task_ids ( rtxn , query ) ? ;
2022-10-27 17:17:50 +08:00
2022-11-28 23:27:41 +08:00
// If the query contains a list of index uid or there is a finite list of authorized indexes,
// then we must exclude all the kinds that aren't associated to one and only one index.
2023-02-19 21:40:25 +08:00
if query . index_uids . is_some ( ) | | ! filters . all_indexes_authorized ( ) {
2022-10-28 00:00:04 +08:00
for kind in enum_iterator ::all ::< Kind > ( ) . filter ( | kind | ! kind . related_to_one_index ( ) ) {
tasks - = self . get_kind ( rtxn , kind ) ? ;
}
2022-10-27 17:17:50 +08:00
}
// Any task that is internally associated with a non-authorized index
// must be discarded.
2023-02-19 21:40:25 +08:00
if ! filters . all_indexes_authorized ( ) {
2022-10-27 17:17:50 +08:00
let all_indexes_iter = self . index_tasks . iter ( rtxn ) ? ;
2022-10-27 22:23:50 +08:00
for result in all_indexes_iter {
let ( index , index_tasks ) = result ? ;
2023-02-19 21:40:25 +08:00
if ! filters . is_index_authorized ( index ) {
2022-10-27 17:17:50 +08:00
tasks - = index_tasks ;
}
}
}
2023-07-05 16:58:10 +08:00
Ok ( ( tasks , total_tasks . len ( ) ) )
2022-10-27 17:17:50 +08:00
}
2023-07-05 16:58:10 +08:00
/// Return the tasks matching the query from the user's point of view along
/// with the total number of tasks matching the query, ignoring from and limit.
2022-10-27 17:17:50 +08:00
///
/// There are two differences between an internal query and a query executed by
/// the user.
///
/// 1. IndexSwap tasks are not publicly associated with any index, but they are associated
/// with many indexes internally.
2022-10-27 22:23:50 +08:00
/// 2. The user may not have the rights to access the tasks (internally) associated with all indexes.
2022-10-27 17:17:50 +08:00
pub fn get_tasks_from_authorized_indexes (
& self ,
query : Query ,
2023-02-19 21:40:25 +08:00
filters : & meilisearch_auth ::AuthFilter ,
2023-07-05 16:58:10 +08:00
) -> Result < ( Vec < Task > , u64 ) > {
2022-10-27 17:17:50 +08:00
let rtxn = self . env . read_txn ( ) ? ;
2023-07-05 16:58:10 +08:00
let ( tasks , total ) = self . get_task_ids_from_authorized_indexes ( & rtxn , & query , filters ) ? ;
2022-10-13 18:48:23 +08:00
let tasks = self . get_existing_tasks (
& rtxn ,
2022-10-21 00:00:07 +08:00
tasks . into_iter ( ) . rev ( ) . take ( query . limit . unwrap_or ( u32 ::MAX ) as usize ) ,
2022-10-13 18:48:23 +08:00
) ? ;
2022-10-21 00:00:07 +08:00
let ProcessingTasks { started_at , processing , .. } =
self . processing_tasks . read ( ) . map_err ( | _ | Error ::CorruptedTaskQueue ) ? . clone ( ) ;
2022-10-03 22:15:10 +08:00
2022-10-12 09:21:25 +08:00
let ret = tasks . into_iter ( ) ;
2022-10-03 22:15:10 +08:00
if processing . is_empty ( ) {
2023-07-05 16:58:10 +08:00
Ok ( ( ret . collect ( ) , total ) )
2022-10-03 22:15:10 +08:00
} else {
2023-07-05 16:58:10 +08:00
Ok ( (
ret . map ( | task | {
if processing . contains ( task . uid ) {
2022-10-21 00:00:07 +08:00
Task { status : Status ::Processing , started_at : Some ( started_at ) , .. task }
2023-07-05 16:58:10 +08:00
} else {
task
2022-10-21 00:00:07 +08:00
}
2022-10-03 22:15:10 +08:00
} )
2023-07-05 16:58:10 +08:00
. collect ( ) ,
total ,
) )
2022-10-03 22:15:10 +08:00
}
}
2022-10-20 16:25:34 +08:00
/// Register a new task in the scheduler.
///
/// If it fails and data was associated with the task, it tries to delete the associated data.
2024-02-21 18:21:26 +08:00
pub fn register (
& self ,
kind : KindWithContent ,
task_id : Option < TaskId > ,
dry_run : bool ,
) -> Result < Task > {
2022-10-03 22:15:10 +08:00
let mut wtxn = self . env . write_txn ( ) ? ;
2023-04-13 00:46:24 +08:00
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
2023-04-07 00:26:27 +08:00
if ! matches! ( & kind , KindWithContent ::TaskDeletion { tasks , .. } if ! tasks . is_empty ( ) )
2023-11-23 01:21:19 +08:00
& & ( self . env . non_free_pages_size ( ) ? * 100 ) / self . env . info ( ) . map_size as u64 > 50
2023-04-07 00:26:27 +08:00
{
return Err ( Error ::NoSpaceLeftInTaskQueue ) ;
}
2023-09-07 17:16:51 +08:00
let next_task_id = self . next_task_id ( & wtxn ) ? ;
if let Some ( uid ) = task_id {
if uid < next_task_id {
return Err ( Error ::BadTaskId { received : uid , expected : next_task_id } ) ;
}
}
2022-10-25 22:10:14 +08:00
let mut task = Task {
2023-09-07 17:16:51 +08:00
uid : task_id . unwrap_or ( next_task_id ) ,
2022-12-22 18:46:17 +08:00
enqueued_at : OffsetDateTime ::now_utc ( ) ,
2022-10-03 22:15:10 +08:00
started_at : None ,
finished_at : None ,
error : None ,
2022-10-18 19:57:58 +08:00
canceled_by : None ,
2022-10-17 23:19:17 +08:00
details : kind . default_details ( ) ,
2022-10-03 22:15:10 +08:00
status : Status ::Enqueued ,
2022-10-17 23:19:17 +08:00
kind : kind . clone ( ) ,
2022-10-03 22:15:10 +08:00
} ;
2022-10-25 22:10:14 +08:00
// For deletion and cancelation tasks, we want to make extra sure that they
// don't attempt to delete/cancel tasks that are newer than themselves.
filter_out_references_to_newer_tasks ( & mut task ) ;
2022-10-27 15:41:32 +08:00
// If the register task is an index swap task, verify that it is well-formed
// (that it does not contain duplicate indexes).
check_index_swap_validity ( & task ) ? ;
2024-02-21 18:21:26 +08:00
// At this point the task is going to be registered and no further checks will be done
if dry_run {
return Ok ( task ) ;
}
2022-10-25 22:10:14 +08:00
// Get rid of the mutability.
let task = task ;
2023-11-23 01:21:19 +08:00
self . all_tasks . put_with_flags ( & mut wtxn , PutFlags ::APPEND , & task . uid , & task ) ? ;
2022-10-03 22:15:10 +08:00
2022-10-25 16:26:51 +08:00
for index in task . indexes ( ) {
self . update_index ( & mut wtxn , index , | bitmap | {
bitmap . insert ( task . uid ) ;
} ) ? ;
2022-10-03 22:15:10 +08:00
}
self . update_status ( & mut wtxn , Status ::Enqueued , | bitmap | {
bitmap . insert ( task . uid ) ;
} ) ? ;
self . update_kind ( & mut wtxn , task . kind . as_kind ( ) , | bitmap | {
2022-10-26 23:31:23 +08:00
bitmap . insert ( task . uid ) ;
2022-10-03 22:15:10 +08:00
} ) ? ;
2022-10-19 18:59:12 +08:00
utils ::insert_task_datetime ( & mut wtxn , self . enqueued_at , task . enqueued_at , task . uid ) ? ;
2022-10-19 17:31:08 +08:00
if let Err ( e ) = wtxn . commit ( ) {
self . delete_persisted_task_data ( & task ) ? ;
return Err ( e . into ( ) ) ;
2022-10-03 22:15:10 +08:00
}
2022-10-17 23:19:17 +08:00
// If the registered task is a task cancelation
// we inform the processing tasks to stop (if necessary).
if let KindWithContent ::TaskCancelation { tasks , .. } = kind {
let tasks_to_cancel = RoaringBitmap ::from_iter ( tasks ) ;
2022-10-21 00:00:07 +08:00
if self . processing_tasks . read ( ) . unwrap ( ) . must_cancel_processing_tasks ( & tasks_to_cancel )
2022-10-19 17:22:59 +08:00
{
self . must_stop_processing . must_stop ( ) ;
}
2022-10-17 23:19:17 +08:00
}
2022-10-10 22:20:35 +08:00
// notify the scheduler loop to execute a new tick
self . wake_up . signal ( ) ;
2022-10-03 22:15:10 +08:00
2022-10-12 09:21:25 +08:00
Ok ( task )
2022-10-03 22:15:10 +08:00
}
2023-01-09 16:36:00 +08:00
/// Register a new task coming from a dump in the scheduler.
/// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running.
2023-03-29 20:27:40 +08:00
pub fn register_dumped_task ( & mut self ) -> Result < Dump > {
Dump ::new ( self )
2022-10-16 07:39:01 +08:00
}
/// Create a new index without any associated task.
2022-12-21 21:28:00 +08:00
pub fn create_raw_index (
& self ,
name : & str ,
2022-12-22 18:46:17 +08:00
date : Option < ( OffsetDateTime , OffsetDateTime ) > ,
2022-12-21 21:28:00 +08:00
) -> Result < Index > {
2022-10-26 20:19:56 +08:00
let wtxn = self . env . write_txn ( ) ? ;
2022-12-16 15:11:12 +08:00
let index = self . index_mapper . create_index ( wtxn , name , date ) ? ;
2022-10-16 09:14:01 +08:00
Ok ( index )
2022-10-16 07:39:01 +08:00
}
2022-10-20 16:25:34 +08:00
/// Create a file and register it in the index scheduler.
///
/// The returned file and uuid can be used to associate
/// some data to a task. The file will be kept until
/// the task has been fully processed.
2024-02-21 18:21:26 +08:00
pub fn create_update_file ( & self , dry_run : bool ) -> Result < ( Uuid , file_store ::File ) > {
if dry_run {
Ok ( ( Uuid ::nil ( ) , file_store ::File ::dry_file ( ) ? ) )
} else {
Ok ( self . file_store . new_update ( ) ? )
}
2022-10-03 22:15:10 +08:00
}
2022-10-17 19:11:12 +08:00
2022-10-10 21:51:28 +08:00
/// Test-only variant of `create_update_file` that lets the caller choose the uuid.
#[cfg(test)]
pub fn create_update_file_with_uuid(&self, uuid: u128) -> Result<(Uuid, file_store::File)> {
    let update_file = self.file_store.new_update_with_uuid(uuid)?;
    Ok(update_file)
}
2022-10-03 22:15:10 +08:00
2023-01-25 18:04:29 +08:00
/// The size on disk taken by all the updates files contained in the `IndexScheduler`, in bytes.
2023-01-25 18:20:15 +08:00
pub fn compute_update_file_size ( & self ) -> Result < u64 > {
Ok ( self . file_store . compute_total_size ( ) ? )
2023-01-24 23:17:23 +08:00
}
2022-10-20 16:25:34 +08:00
/// Delete a file from the index scheduler.
///
/// Counterpart to the [`create_update_file`](IndexScheduler::create_update_file) method.
2022-10-03 22:15:10 +08:00
pub fn delete_update_file ( & self , uuid : Uuid ) -> Result < ( ) > {
Ok ( self . file_store . delete ( uuid ) ? )
}
2022-10-20 16:25:34 +08:00
/// Perform one iteration of the run loop.
///
2023-04-25 02:04:50 +08:00
/// 1. See if we need to cleanup the task queue
/// 2. Find the next batch of tasks to be processed.
/// 3. Update the information of these tasks following the start of their processing.
/// 4. Update the in-memory list of processed tasks accordingly.
/// 5. Process the batch:
2022-10-20 16:25:34 +08:00
/// - perform the actions of each batched task
/// - update the information of each batched task following the end
/// of their processing.
2023-04-25 02:04:50 +08:00
/// 6. Reset the in-memory list of processed tasks.
2022-10-10 22:19:23 +08:00
///
/// Returns the number of processed tasks.
2023-01-10 02:30:29 +08:00
fn tick ( & self ) -> Result < TickOutcome > {
2022-10-03 22:15:10 +08:00
#[ cfg(test) ]
2022-10-20 23:11:44 +08:00
{
* self . run_loop_iteration . write ( ) . unwrap ( ) + = 1 ;
self . breakpoint ( Breakpoint ::Start ) ;
}
2022-10-03 22:15:10 +08:00
2024-02-20 19:16:50 +08:00
if self . cleanup_enabled {
self . cleanup_task_queue ( ) ? ;
}
2023-04-25 02:04:50 +08:00
2022-10-24 20:16:14 +08:00
let rtxn = self . env . read_txn ( ) . map_err ( Error ::HeedTransaction ) ? ;
let batch =
match self . create_next_batch ( & rtxn ) . map_err ( | e | Error ::CreateBatch ( Box ::new ( e ) ) ) ? {
Some ( batch ) = > batch ,
2023-09-28 16:10:12 +08:00
None = > return Ok ( TickOutcome ::WaitForSignal ) ,
2022-10-24 20:16:14 +08:00
} ;
2023-01-09 16:36:00 +08:00
let index_uid = batch . index_uid ( ) . map ( ToOwned ::to_owned ) ;
2022-10-19 13:34:10 +08:00
drop ( rtxn ) ;
2022-10-03 22:15:10 +08:00
// 1. store the starting date with the bitmap of processing tasks.
2024-02-26 17:43:04 +08:00
let ids = batch . ids ( ) ;
2022-10-10 22:19:23 +08:00
let processed_tasks = ids . len ( ) ;
2022-10-03 22:15:10 +08:00
let started_at = OffsetDateTime ::now_utc ( ) ;
2022-10-19 17:22:59 +08:00
// We reset the must_stop flag to be sure that we don't stop processing tasks
self . must_stop_processing . reset ( ) ;
2024-02-26 17:43:04 +08:00
self . processing_tasks . write ( ) . unwrap ( ) . start_processing_at ( started_at , ids . clone ( ) ) ;
2022-10-03 22:15:10 +08:00
#[ cfg(test) ]
2022-10-20 23:11:44 +08:00
self . breakpoint ( Breakpoint ::BatchCreated ) ;
2022-10-03 22:15:10 +08:00
// 2. Process the tasks
2022-10-20 23:11:44 +08:00
let res = {
let cloned_index_scheduler = self . private_clone ( ) ;
2022-11-28 23:27:41 +08:00
let handle = std ::thread ::Builder ::new ( )
. name ( String ::from ( " batch-operation " ) )
. spawn ( move | | cloned_index_scheduler . process_batch ( batch ) )
. unwrap ( ) ;
2022-10-25 16:53:29 +08:00
handle . join ( ) . unwrap_or ( Err ( Error ::ProcessBatchPanicked ) )
2022-10-20 23:11:44 +08:00
} ;
2023-11-10 17:50:19 +08:00
// Reset the currently updating index to relinquish the index handle
2024-01-09 22:37:27 +08:00
self . index_mapper . set_currently_updating_index ( None ) ;
2023-11-10 17:50:19 +08:00
2022-10-20 23:11:44 +08:00
#[ cfg(test) ]
self . maybe_fail ( tests ::FailureLocation ::AcquiringWtxn ) ? ;
2022-10-24 20:16:14 +08:00
let mut wtxn = self . env . write_txn ( ) . map_err ( Error ::HeedTransaction ) ? ;
2022-10-17 22:30:18 +08:00
2022-10-03 22:15:10 +08:00
let finished_at = OffsetDateTime ::now_utc ( ) ;
match res {
Ok ( tasks ) = > {
2022-10-24 20:16:14 +08:00
#[ cfg(test) ]
self . breakpoint ( Breakpoint ::ProcessBatchSucceeded ) ;
2022-11-28 23:27:41 +08:00
2024-02-22 21:56:22 +08:00
let mut success = 0 ;
let mut failure = 0 ;
2022-10-20 23:11:44 +08:00
#[ allow(unused_variables) ]
for ( i , mut task ) in tasks . into_iter ( ) . enumerate ( ) {
2022-10-03 22:15:10 +08:00
task . started_at = Some ( started_at ) ;
task . finished_at = Some ( finished_at ) ;
2022-10-20 23:11:44 +08:00
#[ cfg(test) ]
self . maybe_fail (
tests ::FailureLocation ::UpdatingTaskAfterProcessBatchSuccess {
task_uid : i as u32 ,
} ,
) ? ;
2024-02-22 21:56:22 +08:00
match task . error {
Some ( _ ) = > failure + = 1 ,
None = > success + = 1 ,
}
2022-10-24 20:16:14 +08:00
self . update_task ( & mut wtxn , & task )
. map_err ( | e | Error ::TaskDatabaseUpdate ( Box ::new ( e ) ) ) ? ;
2022-10-03 22:15:10 +08:00
}
2024-02-22 21:56:22 +08:00
tracing ::info! ( " A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks. " ) ;
2022-10-03 22:15:10 +08:00
}
2022-10-17 23:19:17 +08:00
// If we have an abortion error we must stop the tick here and re-schedule tasks.
Err ( Error ::Milli ( milli ::Error ::InternalError (
milli ::InternalError ::AbortedIndexation ,
2023-11-14 17:59:02 +08:00
) ) )
| Err ( Error ::AbortedTask ) = > {
2022-10-24 20:16:14 +08:00
#[ cfg(test) ]
self . breakpoint ( Breakpoint ::AbortedIndexation ) ;
2023-11-23 01:21:19 +08:00
wtxn . abort ( ) ;
2022-11-28 23:27:41 +08:00
2024-02-22 21:56:22 +08:00
tracing ::info! ( " A batch of tasks was aborted. " ) ;
2022-11-28 23:27:41 +08:00
// We make sure that we don't call `stop_processing` on the `processing_tasks`,
// this is because we want to let the next tick call `create_next_batch` and keep
// the `started_at` date times and `processings` of the current processing tasks.
// This date time is used by the task cancelation to store the right `started_at`
// date in the task on disk.
2023-01-10 02:30:29 +08:00
return Ok ( TickOutcome ::TickAgain ( 0 ) ) ;
2022-10-17 23:19:17 +08:00
}
2023-01-09 16:36:00 +08:00
// If an index said it was full, we need to:
// 1. identify which index is full
// 2. close the associated environment
// 3. resize it
// 4. re-schedule tasks
Err ( Error ::Milli ( milli ::Error ::UserError (
milli ::UserError ::MaxDatabaseSizeReached ,
) ) ) if index_uid . is_some ( ) = > {
// fixme: add index_uid to match to avoid the unwrap
let index_uid = index_uid . unwrap ( ) ;
// fixme: handle error more gracefully? not sure when this could happen
self . index_mapper . resize_index ( & wtxn , & index_uid ) ? ;
2023-11-23 01:21:19 +08:00
wtxn . abort ( ) ;
2023-01-09 16:36:00 +08:00
2024-02-22 21:56:22 +08:00
tracing ::info! ( " The max database size was reached. Resizing the index. " ) ;
2023-01-09 16:36:00 +08:00
return Ok ( TickOutcome ::TickAgain ( 0 ) ) ;
2022-10-17 23:19:17 +08:00
}
2022-10-03 22:15:10 +08:00
// In case of a failure we must get back and patch all the tasks with the error.
2022-10-05 22:48:43 +08:00
Err ( err ) = > {
2022-10-24 20:16:14 +08:00
#[ cfg(test) ]
self . breakpoint ( Breakpoint ::ProcessBatchFailed ) ;
2022-10-05 22:48:43 +08:00
let error : ResponseError = err . into ( ) ;
2024-02-22 21:56:22 +08:00
for id in ids . iter ( ) {
2022-10-24 20:16:14 +08:00
let mut task = self
. get_task ( & wtxn , id )
. map_err ( | e | Error ::TaskDatabaseUpdate ( Box ::new ( e ) ) ) ?
. ok_or ( Error ::CorruptedTaskQueue ) ? ;
2022-10-03 22:15:10 +08:00
task . started_at = Some ( started_at ) ;
task . finished_at = Some ( finished_at ) ;
task . status = Status ::Failed ;
2022-10-05 22:48:43 +08:00
task . error = Some ( error . clone ( ) ) ;
2022-11-28 23:27:41 +08:00
task . details = task . details . map ( | d | d . to_failed ( ) ) ;
2022-10-03 22:15:10 +08:00
2022-10-20 23:11:44 +08:00
#[ cfg(test) ]
self . maybe_fail ( tests ::FailureLocation ::UpdatingTaskAfterProcessBatchFailure ) ? ;
2024-02-22 21:56:22 +08:00
tracing ::info! ( " Batch failed {} " , error ) ;
2022-10-24 20:16:14 +08:00
self . update_task ( & mut wtxn , & task )
. map_err ( | e | Error ::TaskDatabaseUpdate ( Box ::new ( e ) ) ) ? ;
2022-10-03 22:15:10 +08:00
}
}
}
2022-10-26 23:31:23 +08:00
2023-11-27 22:11:22 +08:00
let processed = self . processing_tasks . write ( ) . unwrap ( ) . stop_processing ( ) ;
2022-10-20 23:11:44 +08:00
#[ cfg(test) ]
self . maybe_fail ( tests ::FailureLocation ::CommittingWtxn ) ? ;
2022-10-24 20:16:14 +08:00
wtxn . commit ( ) . map_err ( Error ::HeedTransaction ) ? ;
2022-10-03 22:15:10 +08:00
2024-03-28 01:26:47 +08:00
// Once the tasks are committed, we should delete all the update files associated ASAP to avoid leaking files in case of a restart
tracing ::debug! ( " Deleting the update files " ) ;
2024-02-22 21:56:22 +08:00
2024-02-26 17:43:04 +08:00
//We take one read transaction **per thread**. Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap
let idx = AtomicU32 ::new ( 0 ) ;
( 0 .. current_num_threads ( ) ) . into_par_iter ( ) . try_for_each ( | _ | -> Result < ( ) > {
2024-02-22 21:56:22 +08:00
let rtxn = self . read_txn ( ) ? ;
2024-02-26 17:43:04 +08:00
while let Some ( id ) = ids . select ( idx . fetch_add ( 1 , Ordering ::Relaxed ) ) {
let task = self
. get_task ( & rtxn , id )
. map_err ( | e | Error ::TaskDatabaseUpdate ( Box ::new ( e ) ) ) ?
. ok_or ( Error ::CorruptedTaskQueue ) ? ;
if let Err ( e ) = self . delete_persisted_task_data ( & task ) {
tracing ::error! (
" Failure to delete the content files associated with task {}. Error: {e} " ,
task . uid
) ;
}
2024-02-22 21:56:22 +08:00
}
Ok ( ( ) )
} ) ? ;
2023-11-27 22:11:22 +08:00
// We shouldn't crash the tick function if we can't send data to the webhook.
let _ = self . notify_webhook ( & processed ) ;
2022-10-03 22:15:10 +08:00
#[ cfg(test) ]
2022-10-20 23:11:44 +08:00
self . breakpoint ( Breakpoint ::AfterProcessing ) ;
2022-10-03 22:15:10 +08:00
2023-01-10 02:30:29 +08:00
Ok ( TickOutcome ::TickAgain ( processed_tasks ) )
2022-10-03 22:15:10 +08:00
}
2022-10-18 21:04:14 +08:00
2024-03-28 01:26:47 +08:00
/// Once the tasks changes have been committed we must send all the tasks that were updated to our webhook if there is one.
2023-11-27 22:11:22 +08:00
fn notify_webhook ( & self , updated : & RoaringBitmap ) -> Result < ( ) > {
if let Some ( ref url ) = self . webhook_url {
2023-11-29 21:27:50 +08:00
struct TaskReader < ' a , ' b > {
rtxn : & ' a RoTxn < ' a > ,
index_scheduler : & ' a IndexScheduler ,
tasks : & ' b mut roaring ::bitmap ::Iter < ' b > ,
buffer : Vec < u8 > ,
written : usize ,
}
2023-11-27 22:11:22 +08:00
2023-11-29 21:27:50 +08:00
impl < ' a , ' b > Read for TaskReader < ' a , ' b > {
fn read ( & mut self , mut buf : & mut [ u8 ] ) -> std ::io ::Result < usize > {
if self . buffer . is_empty ( ) {
match self . tasks . next ( ) {
None = > return Ok ( 0 ) ,
Some ( task_id ) = > {
let task = self
. index_scheduler
. get_task ( self . rtxn , task_id )
2023-11-29 21:51:47 +08:00
. map_err ( | err | io ::Error ::new ( io ::ErrorKind ::Other , err ) ) ?
. ok_or_else ( | | {
io ::Error ::new (
io ::ErrorKind ::Other ,
Error ::CorruptedTaskQueue ,
)
} ) ? ;
2023-11-29 21:27:50 +08:00
serde_json ::to_writer (
& mut self . buffer ,
& TaskView ::from_task ( & task ) ,
) ? ;
self . buffer . push ( b '\n' ) ;
}
}
}
let mut to_write = & self . buffer [ self . written .. ] ;
let wrote = io ::copy ( & mut to_write , & mut buf ) ? ;
self . written + = wrote as usize ;
2023-11-27 22:11:22 +08:00
2023-11-29 21:27:50 +08:00
// we wrote everything and must refresh our buffer on the next call
if self . written = = self . buffer . len ( ) {
self . written = 0 ;
self . buffer . clear ( ) ;
}
Ok ( wrote as usize )
}
2023-11-27 22:11:22 +08:00
}
2023-11-29 21:27:50 +08:00
let rtxn = self . env . read_txn ( ) ? ;
let task_reader = TaskReader {
rtxn : & rtxn ,
index_scheduler : self ,
tasks : & mut updated . into_iter ( ) ,
buffer : Vec ::with_capacity ( 50 ) , // on average a task is around ~100 bytes
written : 0 ,
} ;
2023-12-19 19:18:45 +08:00
// let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
2023-11-29 21:27:50 +08:00
let reader = GzEncoder ::new ( BufReader ::new ( task_reader ) , Compression ::default ( ) ) ;
2024-02-28 22:53:01 +08:00
let request = ureq ::post ( url )
2024-03-19 21:53:50 +08:00
. timeout ( Duration ::from_secs ( 30 ) )
2024-02-28 22:53:01 +08:00
. set ( " Content-Encoding " , " gzip " )
. set ( " Content-Type " , " application/x-ndjson " ) ;
2023-12-19 19:18:45 +08:00
let request = match & self . webhook_authorization_header {
Some ( header ) = > request . set ( " Authorization " , header ) ,
None = > request ,
} ;
if let Err ( e ) = request . send ( reader ) {
2024-02-07 22:51:38 +08:00
tracing ::error! ( " While sending data to the webhook: {e} " ) ;
2023-11-29 20:09:04 +08:00
}
2023-11-27 22:11:22 +08:00
}
Ok ( ( ) )
}
2023-04-25 02:04:50 +08:00
/// Register a task to cleanup the task queue if needed
fn cleanup_task_queue ( & self ) -> Result < ( ) > {
2023-04-25 23:26:34 +08:00
let rtxn = self . env . read_txn ( ) . map_err ( Error ::HeedTransaction ) ? ;
let nb_tasks = self . all_task_ids ( & rtxn ) ? . len ( ) ;
// if we have less than 1M tasks everything is fine
if nb_tasks < self . max_number_of_tasks as u64 {
2023-04-25 02:04:50 +08:00
return Ok ( ( ) ) ;
}
let finished = self . status . get ( & rtxn , & Status ::Succeeded ) ? . unwrap_or_default ( )
| self . status . get ( & rtxn , & Status ::Failed ) ? . unwrap_or_default ( )
| self . status . get ( & rtxn , & Status ::Canceled ) ? . unwrap_or_default ( ) ;
2023-04-25 23:26:34 +08:00
let to_delete = RoaringBitmap ::from_iter ( finished . into_iter ( ) . rev ( ) . take ( 100_000 ) ) ;
2023-04-25 02:04:50 +08:00
// /!\ the len must be at least 2 or else we might enter an infinite loop where we only delete
// the deletion tasks we enqueued ourselves.
if to_delete . len ( ) < 2 {
2024-02-07 22:51:38 +08:00
tracing ::warn! ( " The task queue is almost full, but no task can be deleted yet. " ) ;
2023-04-25 02:04:50 +08:00
// the only thing we can do is hope that the user tasks are going to finish
return Ok ( ( ) ) ;
}
2024-02-07 22:51:38 +08:00
tracing ::info! (
2023-04-26 18:02:06 +08:00
" The task queue is almost full. Deleting the oldest {} finished tasks. " ,
2023-04-25 19:11:58 +08:00
to_delete . len ( )
) ;
2023-04-26 19:55:02 +08:00
// it's safe to unwrap here because we checked the len above
let newest_task_id = to_delete . iter ( ) . last ( ) . unwrap ( ) ;
2023-05-04 15:56:48 +08:00
let last_task_to_delete =
self . get_task ( & rtxn , newest_task_id ) ? . ok_or ( Error ::CorruptedTaskQueue ) ? ;
2023-04-26 19:55:02 +08:00
drop ( rtxn ) ;
2023-05-04 15:56:48 +08:00
// increase time by one nanosecond so that the enqueuedAt of the last task to delete is also lower than that date.
let delete_before = last_task_to_delete . enqueued_at + Duration ::from_nanos ( 1 ) ;
2023-09-07 17:16:51 +08:00
self . register (
KindWithContent ::TaskDeletion {
query : format ! (
" ?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled " ,
delete_before . format ( & Rfc3339 ) . map_err ( | _ | Error ::CorruptedTaskQueue ) ? ,
) ,
tasks : to_delete ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
) ? ;
2023-04-25 02:04:50 +08:00
Ok ( ( ) )
}
2023-02-24 02:31:57 +08:00
/// Returns the statistics of the index `index_uid`, together with whether the
/// scheduler reports that index as currently being processed.
pub fn index_stats(&self, index_uid: &str) -> Result<IndexStats> {
    let is_indexing = self.is_index_processing(index_uid)?;

    let rtxn = self.read_txn()?;
    let inner_stats = self.index_mapper.stats_of(&rtxn, index_uid)?;

    Ok(IndexStats { is_indexing, inner_stats })
}
2023-10-23 16:38:56 +08:00
/// Returns the read-only view produced by the scheduler's feature store.
pub fn features(&self) -> RoFeatures {
    let store = &self.features;
    store.features()
}
pub fn put_runtime_features ( & self , features : RuntimeTogglableFeatures ) -> Result < ( ) > {
let wtxn = self . env . write_txn ( ) . map_err ( Error ::HeedTransaction ) ? ;
self . features . put_runtime_features ( wtxn , features ) ? ;
Ok ( ( ) )
}
2022-10-18 21:04:14 +08:00
/// Deletes the update file referenced by `task`, if the task references one.
///
/// Tasks without a content uuid are left untouched and `Ok(())` is returned.
pub(crate) fn delete_persisted_task_data(&self, task: &Task) -> Result<()> {
    if let Some(content_file) = task.content_uuid() {
        return self.delete_update_file(content_file);
    }

    Ok(())
}
2022-10-20 23:11:44 +08:00
2023-11-15 22:46:37 +08:00
// TODO: consider using a type alias or a struct embedder/template
pub fn embedders (
& self ,
2024-05-22 21:27:09 +08:00
embedding_configs : Vec < ( String , milli ::vector ::EmbeddingConfig , RoaringBitmap ) > ,
2023-12-13 22:38:44 +08:00
) -> Result < EmbeddingConfigs > {
2023-11-15 22:46:37 +08:00
let res : Result < _ > = embedding_configs
. into_iter ( )
2024-05-22 21:27:09 +08:00
. map ( | ( name , milli ::vector ::EmbeddingConfig { embedder_options , prompt } , _ ) | {
2023-11-15 22:46:37 +08:00
let prompt =
Arc ::new ( prompt . try_into ( ) . map_err ( meilisearch_types ::milli ::Error ::from ) ? ) ;
// optimistically return existing embedder
{
let embedders = self . embedders . read ( ) . unwrap ( ) ;
if let Some ( embedder ) = embedders . get ( & embedder_options ) {
return Ok ( ( name , ( embedder . clone ( ) , prompt ) ) ) ;
}
}
// add missing embedder
let embedder = Arc ::new (
Embedder ::new ( embedder_options . clone ( ) )
. map_err ( meilisearch_types ::milli ::vector ::Error ::from )
. map_err ( meilisearch_types ::milli ::Error ::from ) ? ,
) ;
{
let mut embedders = self . embedders . write ( ) . unwrap ( ) ;
embedders . insert ( embedder_options , embedder . clone ( ) ) ;
}
Ok ( ( name , ( embedder , prompt ) ) )
} )
. collect ( ) ;
2023-12-13 22:38:44 +08:00
res . map ( EmbeddingConfigs ::new )
2023-11-15 22:46:37 +08:00
}
2022-10-25 15:48:51 +08:00
/// Blocks the thread until the test handle asks to progress to/through this breakpoint.
///
/// Two messages are sent through the channel for each breakpoint:
/// first `(b, false)`, then `(b, true)`.
///
/// Since the channel has a capacity of zero, the `send` and `recv` calls wait for each other.
/// Sending `(b, false)` therefore blocks the scheduler until the test side has called
/// `recv()` often enough to reach that message. The scheduler is then unblocked, only to
/// wait again while sending `(b, true)`: that second message can only go through once the
/// test asks to progress to the next `(b2, false)`.
#[cfg(test)]
fn breakpoint(&self, b: Breakpoint) {
    let sender = &self.test_breakpoint_sdr;

    // First message: syncs with the call to `handle.wait_until(b)`.
    sender.send((b, false)).unwrap();

    // Second message: blocks until the next call to `handle.wait_until(..)`.
    // It can only be sent if the test handle stays alive. If it fails, then it means
    // that we have exited the test. By crashing with `unwrap`, we kill the run loop.
    sender.send((b, true)).unwrap();
}
2022-10-03 22:15:10 +08:00
}
2023-03-29 20:27:40 +08:00
pub struct Dump < ' a > {
index_scheduler : & ' a IndexScheduler ,
2023-11-23 01:21:19 +08:00
wtxn : RwTxn < ' a > ,
2023-03-29 20:44:15 +08:00
indexes : HashMap < String , RoaringBitmap > ,
statuses : HashMap < Status , RoaringBitmap > ,
kinds : HashMap < Kind , RoaringBitmap > ,
2023-03-29 20:27:40 +08:00
}
impl < ' a > Dump < ' a > {
pub ( crate ) fn new ( index_scheduler : & ' a mut IndexScheduler ) -> Result < Self > {
// While loading a dump no one should be able to access the scheduler thus I can block everything.
let wtxn = index_scheduler . env . write_txn ( ) ? ;
2023-03-29 20:44:15 +08:00
Ok ( Dump {
index_scheduler ,
wtxn ,
indexes : HashMap ::new ( ) ,
statuses : HashMap ::new ( ) ,
kinds : HashMap ::new ( ) ,
} )
2023-03-29 20:27:40 +08:00
}
/// Register a new task coming from a dump in the scheduler.
/// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running.
pub fn register_dumped_task (
& mut self ,
task : TaskDump ,
content_file : Option < Box < UpdateFile > > ,
) -> Result < Task > {
let content_uuid = match content_file {
Some ( content_file ) if task . status = = Status ::Enqueued = > {
2024-02-21 18:21:26 +08:00
let ( uuid , mut file ) = self . index_scheduler . create_update_file ( false ) ? ;
2024-02-23 01:42:12 +08:00
let mut builder = DocumentsBatchBuilder ::new ( & mut file ) ;
2023-03-29 20:27:40 +08:00
for doc in content_file {
builder . append_json_object ( & doc ? ) ? ;
}
builder . into_inner ( ) ? ;
file . persist ( ) ? ;
Some ( uuid )
}
// If the task isn't `Enqueued` then just generate a recognisable `Uuid`
// in case we try to open it later.
_ if task . status ! = Status ::Enqueued = > Some ( Uuid ::nil ( ) ) ,
_ = > None ,
} ;
let task = Task {
uid : task . uid ,
enqueued_at : task . enqueued_at ,
started_at : task . started_at ,
finished_at : task . finished_at ,
error : task . error ,
canceled_by : task . canceled_by ,
details : task . details ,
status : task . status ,
kind : match task . kind {
KindDump ::DocumentImport {
primary_key ,
method ,
documents_count ,
allow_index_creation ,
} = > KindWithContent ::DocumentAdditionOrUpdate {
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
primary_key ,
method ,
content_file : content_uuid . ok_or ( Error ::CorruptedDump ) ? ,
documents_count ,
allow_index_creation ,
} ,
KindDump ::DocumentDeletion { documents_ids } = > KindWithContent ::DocumentDeletion {
documents_ids ,
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
} ,
2023-03-07 17:02:04 +08:00
KindDump ::DocumentDeletionByFilter { filter } = > {
KindWithContent ::DocumentDeletionByFilter {
2023-05-03 04:36:56 +08:00
filter_expr : filter ,
2023-03-07 17:02:04 +08:00
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
}
}
2023-03-29 20:27:40 +08:00
KindDump ::DocumentClear = > KindWithContent ::DocumentClear {
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
} ,
KindDump ::Settings { settings , is_deletion , allow_index_creation } = > {
KindWithContent ::SettingsUpdate {
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
new_settings : settings ,
is_deletion ,
allow_index_creation ,
}
}
KindDump ::IndexDeletion = > KindWithContent ::IndexDeletion {
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
} ,
KindDump ::IndexCreation { primary_key } = > KindWithContent ::IndexCreation {
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
primary_key ,
} ,
KindDump ::IndexUpdate { primary_key } = > KindWithContent ::IndexUpdate {
index_uid : task . index_uid . ok_or ( Error ::CorruptedDump ) ? ,
primary_key ,
} ,
KindDump ::IndexSwap { swaps } = > KindWithContent ::IndexSwap { swaps } ,
KindDump ::TaskCancelation { query , tasks } = > {
KindWithContent ::TaskCancelation { query , tasks }
}
KindDump ::TasksDeletion { query , tasks } = > {
KindWithContent ::TaskDeletion { query , tasks }
}
KindDump ::DumpCreation { keys , instance_uid } = > {
KindWithContent ::DumpCreation { keys , instance_uid }
}
KindDump ::SnapshotCreation = > KindWithContent ::SnapshotCreation ,
} ,
} ;
2023-11-23 01:21:19 +08:00
self . index_scheduler . all_tasks . put ( & mut self . wtxn , & task . uid , & task ) ? ;
2023-03-29 20:27:40 +08:00
for index in task . indexes ( ) {
2023-03-29 20:44:15 +08:00
match self . indexes . get_mut ( index ) {
Some ( bitmap ) = > {
bitmap . insert ( task . uid ) ;
}
None = > {
let mut bitmap = RoaringBitmap ::new ( ) ;
bitmap . insert ( task . uid ) ;
self . indexes . insert ( index . to_string ( ) , bitmap ) ;
}
} ;
2023-03-29 20:27:40 +08:00
}
2023-04-05 17:32:14 +08:00
utils ::insert_task_datetime (
& mut self . wtxn ,
self . index_scheduler . enqueued_at ,
task . enqueued_at ,
task . uid ,
) ? ;
// we can't override the started_at & finished_at, so we must only set it if the tasks is finished and won't change
if matches! ( task . status , Status ::Succeeded | Status ::Failed | Status ::Canceled ) {
if let Some ( started_at ) = task . started_at {
utils ::insert_task_datetime (
& mut self . wtxn ,
self . index_scheduler . started_at ,
started_at ,
task . uid ,
) ? ;
}
if let Some ( finished_at ) = task . finished_at {
utils ::insert_task_datetime (
& mut self . wtxn ,
self . index_scheduler . finished_at ,
finished_at ,
task . uid ,
) ? ;
}
}
2023-11-23 19:20:44 +08:00
self . statuses . entry ( task . status ) . or_default ( ) . insert ( task . uid ) ;
self . kinds . entry ( task . kind . as_kind ( ) ) . or_default ( ) . insert ( task . uid ) ;
2023-03-29 20:27:40 +08:00
Ok ( task )
}
/// Commit all the changes and exit the importing dump state
2023-03-29 20:44:15 +08:00
pub fn finish ( mut self ) -> Result < ( ) > {
for ( index , bitmap ) in self . indexes {
self . index_scheduler . index_tasks . put ( & mut self . wtxn , & index , & bitmap ) ? ;
}
for ( status , bitmap ) in self . statuses {
self . index_scheduler . put_status ( & mut self . wtxn , status , & bitmap ) ? ;
}
for ( kind , bitmap ) in self . kinds {
self . index_scheduler . put_kind ( & mut self . wtxn , kind , & bitmap ) ? ;
}
2023-03-29 20:27:40 +08:00
self . wtxn . commit ( ) ? ;
self . index_scheduler . wake_up . signal ( ) ;
2023-03-29 20:44:15 +08:00
2023-03-29 20:27:40 +08:00
Ok ( ( ) )
}
}
2023-01-10 02:30:29 +08:00
/// The outcome of calling the [`IndexScheduler::tick`] function.
pub enum TickOutcome {
    /// The scheduler should immediately attempt another `tick`.
    ///
    /// The `u64` field contains the number of processed tasks.
    TickAgain(u64),
    /// The scheduler should wait for an external signal before attempting another `tick`.
    WaitForSignal,
}
2023-02-15 19:30:46 +08:00
/// How many indexes we can afford to have open simultaneously.
struct IndexBudget {
    /// Map size of an index.
    map_size: usize,
    /// Maximum number of simultaneously opened indexes.
    index_count: usize,
    /// For very constrained systems we might need to reduce the base task_db_size
    /// so we can accept at least one index.
    task_db_size: usize,
}
2023-02-28 22:24:31 +08:00
/// The statistics that can be computed from an `Index` object and the scheduler.
///
/// Compared with `index_mapper::IndexStats`, it adds the scheduling status.
2023-02-24 02:31:57 +08:00
#[ derive(Debug) ]
pub struct IndexStats {
2023-02-28 22:24:31 +08:00
/// Whether this index is currently performing indexation, according to the scheduler.
2023-02-24 02:31:57 +08:00
pub is_indexing : bool ,
2023-02-28 22:24:31 +08:00
/// Internal stats computed from the index.
2023-02-24 02:31:57 +08:00
pub inner_stats : index_mapper ::IndexStats ,
}
2022-09-15 18:23:41 +08:00
#[ cfg(test) ]
mod tests {
2024-02-23 01:42:12 +08:00
use std ::io ::{ BufWriter , Write } ;
2022-10-24 20:16:14 +08:00
use std ::time ::Instant ;
2022-10-03 22:15:10 +08:00
use big_s ::S ;
2022-11-28 23:27:41 +08:00
use crossbeam ::channel ::RecvTimeoutError ;
2022-10-18 01:24:06 +08:00
use file_store ::File ;
2024-05-20 16:23:12 +08:00
use insta ::assert_json_snapshot ;
2023-04-26 19:55:02 +08:00
use meili_snap ::{ json_string , snapshot } ;
2023-02-19 21:40:25 +08:00
use meilisearch_auth ::AuthFilter ;
2022-11-30 00:03:22 +08:00
use meilisearch_types ::document_formats ::DocumentFormatError ;
2023-04-25 02:04:50 +08:00
use meilisearch_types ::error ::ErrorCode ;
2023-02-19 21:40:25 +08:00
use meilisearch_types ::index_uid_pattern ::IndexUidPattern ;
2022-10-24 23:29:17 +08:00
use meilisearch_types ::milli ::obkv_to_json ;
use meilisearch_types ::milli ::update ::IndexDocumentsMethod ::{
ReplaceDocuments , UpdateDocuments ,
} ;
2024-05-22 21:27:09 +08:00
use meilisearch_types ::milli ::update ::Setting ;
use meilisearch_types ::milli ::vector ::settings ::EmbeddingSettings ;
use meilisearch_types ::settings ::{ Checked , Unchecked } ;
2022-10-26 18:57:29 +08:00
use meilisearch_types ::tasks ::IndexSwap ;
2022-10-25 21:51:15 +08:00
use meilisearch_types ::VERSION_FILE_NAME ;
2022-11-30 17:59:06 +08:00
use tempfile ::{ NamedTempFile , TempDir } ;
2022-10-24 19:32:46 +08:00
use time ::Duration ;
2022-10-03 22:15:10 +08:00
use uuid ::Uuid ;
2022-11-28 23:27:41 +08:00
use Breakpoint ::* ;
2022-10-03 22:15:10 +08:00
use super ::* ;
2022-10-25 16:23:14 +08:00
use crate ::insta_snapshot ::{ snapshot_bitmap , snapshot_index_scheduler } ;
2022-10-20 23:11:44 +08:00
/// The places of the run loop where a test can plan a failure.
///
/// See `IndexScheduler::maybe_fail`: every variant makes it return an error, except
/// `PanicInsideProcessBatch` which makes it panic.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailureLocation {
    InsideCreateBatch,
    InsideProcessBatch,
    PanicInsideProcessBatch,
    AcquiringWtxn,
    UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 },
    UpdatingTaskAfterProcessBatchFailure,
    CommittingWtxn,
}
impl IndexScheduler {
pub fn test (
2022-10-26 17:41:59 +08:00
autobatching_enabled : bool ,
2022-10-20 23:11:44 +08:00
planned_failures : Vec < ( usize , FailureLocation ) > ,
2023-04-25 23:26:34 +08:00
) -> ( Self , IndexSchedulerHandle ) {
Self ::test_with_custom_config ( planned_failures , | config | {
config . autobatching_enabled = autobatching_enabled ;
} )
}
pub fn test_with_custom_config (
planned_failures : Vec < ( usize , FailureLocation ) > ,
configuration : impl Fn ( & mut IndexSchedulerOptions ) ,
2022-10-20 23:11:44 +08:00
) -> ( Self , IndexSchedulerHandle ) {
let tempdir = TempDir ::new ( ) . unwrap ( ) ;
let ( sender , receiver ) = crossbeam ::channel ::bounded ( 0 ) ;
2023-02-15 19:31:14 +08:00
let indexer_config = IndexerConfig { skip_index_budget : true , .. Default ::default ( ) } ;
2023-04-25 23:26:34 +08:00
let mut options = IndexSchedulerOptions {
2022-10-26 17:41:59 +08:00
version_file_path : tempdir . path ( ) . join ( VERSION_FILE_NAME ) ,
auth_path : tempdir . path ( ) . join ( " auth " ) ,
tasks_path : tempdir . path ( ) . join ( " db_path " ) ,
update_file_path : tempdir . path ( ) . join ( " file_store " ) ,
indexes_path : tempdir . path ( ) . join ( " indexes " ) ,
snapshots_path : tempdir . path ( ) . join ( " snapshots " ) ,
dumps_path : tempdir . path ( ) . join ( " dumps " ) ,
2023-11-27 22:11:22 +08:00
webhook_url : None ,
2023-12-19 19:18:45 +08:00
webhook_authorization_header : None ,
2022-11-29 22:26:24 +08:00
task_db_size : 1000 * 1000 , // 1 MB, we don't use MiB on purpose.
2023-01-12 00:34:46 +08:00
index_base_map_size : 1000 * 1000 , // 1 MB, we don't use MiB on purpose.
2023-05-15 17:23:58 +08:00
enable_mdb_writemap : false ,
2023-01-12 00:34:46 +08:00
index_growth_amount : 1000 * 1000 , // 1 MB
index_count : 5 ,
2023-02-15 19:31:14 +08:00
indexer_config ,
2023-04-25 23:26:34 +08:00
autobatching_enabled : true ,
2024-02-20 19:16:50 +08:00
cleanup_enabled : true ,
2023-04-25 23:26:34 +08:00
max_number_of_tasks : 1_000_000 ,
2023-12-11 23:08:39 +08:00
max_number_of_batched_tasks : usize ::MAX ,
2023-06-23 04:56:44 +08:00
instance_features : Default ::default ( ) ,
2022-10-26 17:41:59 +08:00
} ;
2023-04-25 23:26:34 +08:00
configuration ( & mut options ) ;
2022-10-26 17:41:59 +08:00
let index_scheduler = Self ::new ( options , sender , planned_failures ) . unwrap ( ) ;
2022-10-20 23:11:44 +08:00
2022-11-28 23:27:41 +08:00
// To be 100% consistent between all test we're going to start the scheduler right now
// and ensure it's in the expected starting state.
2024-05-21 20:59:08 +08:00
let breakpoint = match receiver . recv_timeout ( std ::time ::Duration ::from_secs ( 10 ) ) {
2022-11-28 23:27:41 +08:00
Ok ( b ) = > b ,
Err ( RecvTimeoutError ::Timeout ) = > {
panic! ( " The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint. " )
}
Err ( RecvTimeoutError ::Disconnected ) = > panic! ( " The scheduler crashed. " ) ,
} ;
assert_eq! ( breakpoint , ( Init , false ) ) ;
let index_scheduler_handle = IndexSchedulerHandle {
_tempdir : tempdir ,
test_breakpoint_rcv : receiver ,
last_breakpoint : breakpoint . 0 ,
} ;
2022-10-20 23:11:44 +08:00
( index_scheduler , index_scheduler_handle )
}
2023-04-04 03:08:47 +08:00
/// Return a [`PlannedFailure`](Error::PlannedFailure) error if a failure is planned
2022-10-20 23:11:44 +08:00
/// for the given location and current run loop iteration.
pub fn maybe_fail ( & self , location : FailureLocation ) -> Result < ( ) > {
if self . planned_failures . contains ( & ( * self . run_loop_iteration . read ( ) . unwrap ( ) , location ) )
{
match location {
FailureLocation ::PanicInsideProcessBatch = > {
panic! ( " simulated panic " )
}
2023-04-04 03:08:47 +08:00
_ = > Err ( Error ::PlannedFailure ) ,
2022-10-20 23:11:44 +08:00
}
} else {
Ok ( ( ) )
}
}
}
2022-10-03 22:15:10 +08:00
2022-10-10 23:02:28 +08:00
/// Return a `KindWithContent::IndexCreation` task
fn index_creation_task ( index : & 'static str , primary_key : & 'static str ) -> KindWithContent {
2022-10-21 00:00:07 +08:00
KindWithContent ::IndexCreation { index_uid : S ( index ) , primary_key : Some ( S ( primary_key ) ) }
2022-10-10 23:02:28 +08:00
}
/// Create a `KindWithContent::DocumentImport` task that imports documents.
///
/// - `index_uid` is given as parameter
/// - `primary_key` is given as parameter
/// - `method` is set to `ReplaceDocuments`
/// - `content_file` is given as parameter
/// - `documents_count` is given as parameter
/// - `allow_index_creation` is set to `true`
fn replace_document_import_task (
index : & 'static str ,
primary_key : Option < & 'static str > ,
content_file_uuid : u128 ,
documents_count : u64 ,
) -> KindWithContent {
2022-10-22 00:03:10 +08:00
KindWithContent ::DocumentAdditionOrUpdate {
2022-10-10 23:02:28 +08:00
index_uid : S ( index ) ,
primary_key : primary_key . map ( ToOwned ::to_owned ) ,
method : ReplaceDocuments ,
content_file : Uuid ::from_u128 ( content_file_uuid ) ,
2022-10-22 22:35:42 +08:00
documents_count ,
2022-10-10 23:02:28 +08:00
allow_index_creation : true ,
}
}
2022-11-30 00:03:22 +08:00
/// Adapting to the new json reading interface
2022-11-30 17:59:06 +08:00
pub fn read_json (
bytes : & [ u8 ] ,
2024-02-23 01:42:12 +08:00
write : impl Write ,
2022-12-13 22:10:51 +08:00
) -> std ::result ::Result < u64 , DocumentFormatError > {
2022-11-30 00:03:22 +08:00
let temp_file = NamedTempFile ::new ( ) . unwrap ( ) ;
let mut buffer = BufWriter ::new ( temp_file . reopen ( ) . unwrap ( ) ) ;
2022-12-03 22:48:38 +08:00
buffer . write_all ( bytes ) . unwrap ( ) ;
2022-11-30 00:03:22 +08:00
buffer . flush ( ) . unwrap ( ) ;
meilisearch_types ::document_formats ::read_json ( temp_file . as_file ( ) , write )
}
2022-10-10 23:02:28 +08:00
/// Create an update file with the given file uuid.
///
/// The update file contains just one simple document whose id is given by `document_id`.
///
/// The uuid of the file and its documents count is returned.
fn sample_documents (
index_scheduler : & IndexScheduler ,
file_uuid : u128 ,
document_id : usize ,
) -> ( File , u64 ) {
let content = format! (
r #"
{ {
" id " : " {document_id} "
} } " #
) ;
2022-10-21 00:00:07 +08:00
let ( _uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( file_uuid ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-10 23:02:28 +08:00
( file , documents_count )
}
2022-10-03 22:15:10 +08:00
pub struct IndexSchedulerHandle {
_tempdir : TempDir ,
2022-10-20 23:11:44 +08:00
test_breakpoint_rcv : crossbeam ::channel ::Receiver < ( Breakpoint , bool ) > ,
2022-11-28 23:27:41 +08:00
last_breakpoint : Breakpoint ,
2022-10-03 22:15:10 +08:00
}
impl IndexSchedulerHandle {
2022-11-28 23:27:41 +08:00
/// Advance the scheduler to the next tick.
/// Panic
/// * If the scheduler is waiting for a task to be registered.
/// * If the breakpoint queue is in a bad state.
#[ track_caller ]
fn advance ( & mut self ) -> Breakpoint {
let ( breakpoint_1 , b ) = match self
. test_breakpoint_rcv
2024-05-21 20:59:08 +08:00
. recv_timeout ( std ::time ::Duration ::from_secs ( 50 ) )
2022-11-28 23:27:41 +08:00
{
Ok ( b ) = > b ,
Err ( RecvTimeoutError ::Timeout ) = > {
panic! ( " The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint. " )
}
Err ( RecvTimeoutError ::Disconnected ) = > panic! ( " The scheduler crashed. " ) ,
} ;
// if we've already encountered a breakpoint we're supposed to be stuck on the false
// and we expect the same variant with the true to come now.
assert_eq! (
( breakpoint_1 , b ) ,
( self . last_breakpoint , true ) ,
" Internal error in the test suite. In the previous iteration I got `({:?}, false)` and now I got `({:?}, {:?})`. " ,
self . last_breakpoint ,
breakpoint_1 ,
b ,
) ;
let ( breakpoint_2 , b ) = match self
. test_breakpoint_rcv
2024-05-21 20:59:08 +08:00
. recv_timeout ( std ::time ::Duration ::from_secs ( 50 ) )
2022-11-28 23:27:41 +08:00
{
Ok ( b ) = > b ,
Err ( RecvTimeoutError ::Timeout ) = > {
panic! ( " The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint. " )
}
Err ( RecvTimeoutError ::Disconnected ) = > panic! ( " The scheduler crashed. " ) ,
} ;
assert! ( ! b , " Found the breakpoint handle in a bad state. Check your test suite " ) ;
self . last_breakpoint = breakpoint_2 ;
breakpoint_2
2022-10-03 22:15:10 +08:00
}
2022-10-24 23:29:17 +08:00
2022-11-28 23:27:41 +08:00
/// Advance the scheduler until all the provided breakpoints are reached in order.
#[ track_caller ]
fn advance_till ( & mut self , breakpoints : impl IntoIterator < Item = Breakpoint > ) {
for breakpoint in breakpoints {
let b = self . advance ( ) ;
assert_eq! (
b , breakpoint ,
" Was expecting the breakpoint `{:?}` but instead got `{:?}`. " ,
breakpoint , b
) ;
}
}
/// Wait for `n` successful batches.
#[ track_caller ]
fn advance_n_successful_batches ( & mut self , n : usize ) {
2022-10-24 23:29:17 +08:00
for _ in 0 .. n {
2022-11-28 23:27:41 +08:00
self . advance_one_successful_batch ( ) ;
}
}
/// Wait for `n` failed batches.
#[ track_caller ]
fn advance_n_failed_batches ( & mut self , n : usize ) {
for _ in 0 .. n {
self . advance_one_failed_batch ( ) ;
}
}
// Wait for one successful batch.
#[ track_caller ]
fn advance_one_successful_batch ( & mut self ) {
self . advance_till ( [ Start , BatchCreated ] ) ;
loop {
match self . advance ( ) {
// the process_batch function can call itself recursively, thus we need to
// accept as may InsideProcessBatch as possible before moving to the next state.
InsideProcessBatch = > ( ) ,
// the batch went successfully, we can stop the loop and go on with the next states.
ProcessBatchSucceeded = > break ,
AbortedIndexation = > panic! ( " The batch was aborted. " ) ,
ProcessBatchFailed = > panic! ( " The batch failed. " ) ,
breakpoint = > panic! ( " Encountered an impossible breakpoint ` {:?} `, this is probably an issue with the test suite. " , breakpoint ) ,
}
}
self . advance_till ( [ AfterProcessing ] ) ;
}
// Wait for one failed batch.
#[ track_caller ]
fn advance_one_failed_batch ( & mut self ) {
self . advance_till ( [ Start , BatchCreated ] ) ;
loop {
match self . advance ( ) {
// the process_batch function can call itself recursively, thus we need to
// accept as may InsideProcessBatch as possible before moving to the next state.
InsideProcessBatch = > ( ) ,
// the batch went failed, we can stop the loop and go on with the next states.
ProcessBatchFailed = > break ,
ProcessBatchSucceeded = > panic! ( " The batch succeeded. (and it wasn't supposed to sorry) " ) ,
AbortedIndexation = > panic! ( " The batch was aborted. " ) ,
breakpoint = > panic! ( " Encountered an impossible breakpoint ` {:?} `, this is probably an issue with the test suite. " , breakpoint ) ,
}
2022-10-24 23:29:17 +08:00
}
2022-11-28 23:27:41 +08:00
self . advance_till ( [ AfterProcessing ] ) ;
2022-10-24 23:29:17 +08:00
}
2022-10-03 22:15:10 +08:00
}
/// Registers a few tasks and checks the uid, status and kind of each registered task.
#[test]
fn register() {
    // In this test, the handle doesn't make any progress, we only check that the tasks are registered
    let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]);

    let kinds = [
        index_creation_task("catto", "mouse"),
        replace_document_import_task("catto", None, 0, 12),
        replace_document_import_task("catto", None, 1, 50),
        replace_document_import_task("doggo", Some("bone"), 2, 5000),
    ];

    // persist the three (empty) update files the document tasks above refer to
    for file_uuid in 0..3u128 {
        let (_, file) = index_scheduler.create_update_file_with_uuid(file_uuid).unwrap();
        file.persist().unwrap();
    }

    for (expected_uid, kind) in kinds.into_iter().enumerate() {
        let expected_kind = kind.as_kind();

        let task = index_scheduler.register(kind, None, false).unwrap();
        index_scheduler.assert_internally_consistent();

        assert_eq!(task.uid, expected_uid as u32);
        assert_eq!(task.status, Status::Enqueued);
        assert_eq!(task.kind.as_kind(), expected_kind);
    }

    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_successfully_registered");
}
/// Checks that new tasks can be registered while a batch has been created but not processed yet.
#[test]
fn insert_task_while_another_task_is_processing() {
    let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

    let first_task = index_creation_task("index_a", "id");
    index_scheduler.register(first_task, None, false).unwrap();
    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");

    handle.advance_till([Start, BatchCreated]);
    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation");

    // while the task is processing can we register another task?
    let second_task = index_creation_task("index_b", "id");
    index_scheduler.register(second_task, None, false).unwrap();
    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");

    let third_task = KindWithContent::IndexDeletion { index_uid: S("index_a") };
    index_scheduler.register(third_task, None, false).unwrap();
    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task");
}
2023-06-23 07:24:25 +08:00
/// Checks that `is_task_processing` returns `true` once a batch has been created.
#[test]
fn test_task_is_processing() {
    let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

    let task = index_creation_task("index_a", "id");
    index_scheduler.register(task, None, false).unwrap();
    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task");

    handle.advance_till([Start, BatchCreated]);

    let is_processing = index_scheduler.is_task_processing().unwrap();
    assert!(is_processing);
}
2022-10-10 22:18:35 +08:00
/// We register several tasks in quick succession, so the scheduler may be
/// notified only once; we must make sure that all of them are processed
/// nonetheless.
///
/// Three tasks are registered before the scheduler is allowed to advance, and
/// three successful batches are then expected, with a snapshot after each one.
#[ test ]
fn process_tasks_inserted_without_new_signal ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-10 22:18:35 +08:00
// First task: create `doggos`.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-10 22:18:35 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-20 19:11:50 +08:00
2022-10-10 22:18:35 +08:00
// Second task: create `cattos`.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " cattos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-10 22:18:35 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_second_task " ) ;
2022-10-20 19:11:50 +08:00
2022-10-10 22:18:35 +08:00
// Third task: delete `doggos`.
index_scheduler
2024-02-21 18:21:26 +08:00
. register ( KindWithContent ::IndexDeletion { index_uid : S ( " doggos " ) } , None , false )
2022-10-10 22:18:35 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_third_task " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
// Only now is the scheduler allowed to run: one successful batch per snapshot.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_the_first_task " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_the_second_task " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_the_third_task " ) ;
2022-10-10 22:18:35 +08:00
}
2022-10-10 23:00:56 +08:00
/// Registers one index creation followed by three `DocumentClear` tasks on the
/// same index, then expects four separate successful batches.
///
/// NOTE(review): this is the only test here calling `IndexScheduler::test`
/// with `false` as first argument; judging by the test name that flag
/// presumably disables autobatching — confirm against `IndexScheduler::test`.
#[ test ]
fn process_tasks_without_autobatching ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( false , vec! [ ] ) ;
2022-10-10 23:00:56 +08:00
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-10 23:00:56 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-20 19:11:50 +08:00
2022-10-10 23:00:56 +08:00
// Three identical `DocumentClear` tasks on the same index follow.
index_scheduler
2024-02-21 18:21:26 +08:00
. register ( KindWithContent ::DocumentClear { index_uid : S ( " doggos " ) } , None , false )
2022-10-10 23:00:56 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_second_task " ) ;
2022-10-20 19:11:50 +08:00
2022-10-10 23:00:56 +08:00
index_scheduler
2024-02-21 18:21:26 +08:00
. register ( KindWithContent ::DocumentClear { index_uid : S ( " doggos " ) } , None , false )
2022-10-10 23:00:56 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_third_task " ) ;
2022-10-20 19:11:50 +08:00
2022-10-10 23:00:56 +08:00
index_scheduler
2024-02-21 18:21:26 +08:00
. register ( KindWithContent ::DocumentClear { index_uid : S ( " doggos " ) } , None , false )
2022-10-10 23:00:56 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_fourth_task " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
// Four registered tasks, four batches: one `advance_one_successful_batch` per snapshot.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " second " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " third " ) ;
2022-10-10 23:00:56 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " fourth " ) ;
2022-10-10 23:00:56 +08:00
}
2022-10-06 22:53:21 +08:00
/// Registers a `TaskDeletion` that targets tasks 0 and 1 while they are still
/// enqueued, and snapshots the scheduler at each stage of processing that
/// deletion.
#[ test ]
2022-10-10 23:02:28 +08:00
fn task_deletion_undeleteable ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-06 22:53:21 +08:00
2022-10-20 19:11:50 +08:00
// Prepare and persist two document files, referenced below by the import tasks.
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
let ( file1 , documents_count1 ) = sample_documents ( & index_scheduler , 1 , 1 ) ;
file0 . persist ( ) . unwrap ( ) ;
file1 . persist ( ) . unwrap ( ) ;
2022-10-06 22:53:21 +08:00
let to_enqueue = [
2022-10-10 23:02:28 +08:00
index_creation_task ( " catto " , " mouse " ) ,
2022-10-20 19:11:50 +08:00
replace_document_import_task ( " catto " , None , 0 , documents_count0 ) ,
replace_document_import_task ( " doggo " , Some ( " bone " ) , 1 , documents_count1 ) ,
2022-10-06 22:53:21 +08:00
] ;
2022-10-20 19:11:50 +08:00
2022-10-06 22:53:21 +08:00
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-06 22:53:21 +08:00
}
2022-10-10 21:51:28 +08:00
2022-10-10 23:02:28 +08:00
// Here we have registered all the tasks, but the index scheduler
// has not progressed at all.
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_tasks_enqueued " ) ;
2022-10-06 22:53:21 +08:00
2022-10-10 23:02:28 +08:00
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskDeletion {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 , 1 ] ) ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-10 23:02:28 +08:00
. unwrap ( ) ;
// Again, no progress made at all, but one more task is registered.
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " task_deletion_enqueued " ) ;
2022-10-06 22:53:21 +08:00
2022-10-10 23:02:28 +08:00
// Now we create the first batch.
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ Start , BatchCreated ] ) ;
2022-10-06 22:53:21 +08:00
2022-10-10 23:02:28 +08:00
// The task deletion should now be "processing".
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " task_deletion_processing " ) ;
2022-10-06 22:53:21 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ InsideProcessBatch , ProcessBatchSucceeded , AfterProcessing ] ) ;
2022-10-10 23:02:28 +08:00
// After the task deletion is processed, no task should actually have been deleted,
// because the tasks with ids 0 and 1 were still "enqueued", and thus undeleteable.
// The "task deletion" task should be marked as "succeeded" and, in its details, the
// number of deleted tasks should be 0.
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " task_deletion_done " ) ;
2022-10-10 23:02:28 +08:00
}
/// Lets the first document import be processed, then registers a
/// `TaskDeletion` targeting task 0 and processes it, snapshotting the scheduler
/// before and after.
#[ test ]
fn task_deletion_deleteable ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-10 23:02:28 +08:00
// Prepare and persist two document files, referenced below by the import tasks.
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
let ( file1 , documents_count1 ) = sample_documents ( & index_scheduler , 1 , 1 ) ;
2022-10-20 19:11:50 +08:00
file0 . persist ( ) . unwrap ( ) ;
file1 . persist ( ) . unwrap ( ) ;
2022-10-10 23:02:28 +08:00
let to_enqueue = [
replace_document_import_task ( " catto " , None , 0 , documents_count0 ) ,
replace_document_import_task ( " doggo " , Some ( " bone " ) , 1 , documents_count1 ) ,
] ;
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-10 23:02:28 +08:00
}
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_tasks_enqueued " ) ;
2022-10-10 23:02:28 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-10 23:02:28 +08:00
// The first addition of documents should be successful.
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_tasks_processed " ) ;
2022-10-10 23:02:28 +08:00
// Now we delete the first task.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskDeletion {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 ] ) ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-10 23:02:28 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_task_deletion " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
// Process the batch that follows the registration of the task deletion.
handle . advance_one_successful_batch ( ) ;
2022-10-13 17:09:00 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " task_deletion_processed " ) ;
2022-10-06 22:53:21 +08:00
}
2022-10-15 17:38:43 +08:00
/// Registers the same `TaskDeletion` (targeting task 0) twice in a row after
/// the first document import has been processed, then advances one successful
/// batch and snapshots the result.
#[ test ]
fn task_deletion_delete_same_task_twice ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-15 17:38:43 +08:00
// Prepare and persist two document files, referenced below by the import tasks.
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
let ( file1 , documents_count1 ) = sample_documents ( & index_scheduler , 1 , 1 ) ;
2022-10-20 19:11:50 +08:00
file0 . persist ( ) . unwrap ( ) ;
file1 . persist ( ) . unwrap ( ) ;
2022-10-15 17:38:43 +08:00
let to_enqueue = [
replace_document_import_task ( " catto " , None , 0 , documents_count0 ) ,
replace_document_import_task ( " doggo " , Some ( " bone " ) , 1 , documents_count1 ) ,
] ;
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-15 17:38:43 +08:00
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_tasks_enqueued " ) ;
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-15 17:38:43 +08:00
// The first addition of documents should be successful.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_tasks_processed " ) ;
// Now we delete the first task multiple times in a row.
for _ in 0 .. 2 {
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskDeletion {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 ] ) ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-15 17:38:43 +08:00
. unwrap ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-15 17:38:43 +08:00
}
2024-01-11 21:44:29 +08:00
// NOTE(review): a single batch is advanced for the two deletions; presumably
// both are handled together — confirm against the stored snapshot.
handle . advance_one_successful_batch ( ) ;
2022-10-20 19:11:50 +08:00
2022-10-15 17:38:43 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " task_deletion_processed " ) ;
}
2022-10-03 22:15:10 +08:00
/// Registers a single `DocumentAdditionOrUpdate` task backed by an update file
/// holding one JSON document, and snapshots the scheduler after registration,
/// after batch creation, and once processing has finished.
#[ test ]
fn document_addition ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-03 22:15:10 +08:00
let content = r #"
{
" id " : 1 ,
" doggo " : " bob "
} " #;
2022-10-10 21:51:28 +08:00
// Write the payload into a fresh update file and persist it before registering the task.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-20 19:11:50 +08:00
file . persist ( ) . unwrap ( ) ;
2022-10-03 22:15:10 +08:00
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-03 22:15:10 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_register " ) ;
2022-10-20 19:11:50 +08:00
2022-11-28 23:27:41 +08:00
// Stop once the batch has been created.
handle . advance_till ( [ Start , BatchCreated ] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_the_batch_creation " ) ;
2022-10-03 22:15:10 +08:00
2022-11-28 23:27:41 +08:00
// Then run the batch through to the end of its processing.
handle . advance_till ( [ InsideProcessBatch , ProcessBatchSucceeded , AfterProcessing ] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " once_everything_is_processed " ) ;
2022-10-03 22:15:10 +08:00
}
2022-10-19 22:44:42 +08:00
/// Registers an index creation, a document addition and an index deletion on
/// the same index, then advances two successful batches: the index creation
/// first, then the two remaining tasks.
#[ test ]
fn document_addition_and_index_deletion ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-19 22:44:42 +08:00
let content = r #"
{
" id " : 1 ,
" doggo " : " bob "
} " #;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-19 22:44:42 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-19 22:44:42 +08:00
// Write the payload into a fresh update file and persist it before registering the addition.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-19 22:44:42 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-19 22:44:42 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_second_task " ) ;
2022-10-20 19:11:50 +08:00
2022-10-19 22:44:42 +08:00
index_scheduler
2024-02-21 18:21:26 +08:00
. register ( KindWithContent ::IndexDeletion { index_uid : S ( " doggos " ) } , None , false )
2022-10-19 22:44:42 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_third_task " ) ;
2022-10-19 22:44:42 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ; // The index creation.
// NOTE(review): despite its name, this snapshot is taken AFTER the first batch
// (the index creation) has been advanced; the name is kept because it
// identifies the stored snapshot.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " before_index_creation " ) ;
handle . advance_one_successful_batch ( ) ; // The document addition and the index deletion, executed in a single batch.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " both_task_succeeded " ) ;
2022-10-19 22:44:42 +08:00
}
2023-05-17 20:25:50 +08:00
/// Registers a three-document addition followed by a deletion of documents
/// `1` and `2` on the same index, processes them in one successful batch, and
/// snapshots both the scheduler and the documents left in the index.
#[ test ]
fn document_addition_and_document_deletion ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
let content = r #" [
{ " id " : 1 , " doggo " : " jean bob " } ,
{ " id " : 2 , " catto " : " jorts " } ,
{ " id " : 3 , " doggo " : " bork " }
] " #;
// Write the payload into a fresh update file and persist it before registering the addition.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-05-17 20:25:50 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-05-17 20:25:50 +08:00
. unwrap ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentDeletion {
index_uid : S ( " doggos " ) ,
documents_ids : vec ! [ S ( " 1 " ) , S ( " 2 " ) ] ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-05-17 20:25:50 +08:00
. unwrap ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_second_task " ) ;
handle . advance_one_successful_batch ( ) ; // The addition AND deletion should've been batched together
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_processing_the_batch " ) ;
// Read every document back from the index, convert each one to JSON, and snapshot the list.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
/// Registers a document deletion on an index that has not been created yet,
/// followed by a document addition on the same index. The deletion batch is
/// expected to fail and the addition batch to succeed; the documents left in
/// the index are then snapshotted.
#[ test ]
fn document_deletion_and_document_addition ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentDeletion {
index_uid : S ( " doggos " ) ,
documents_ids : vec ! [ S ( " 1 " ) , S ( " 2 " ) ] ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-05-17 20:25:50 +08:00
. unwrap ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
let content = r #" [
{ " id " : 1 , " doggo " : " jean bob " } ,
{ " id " : 2 , " catto " : " jorts " } ,
{ " id " : 3 , " doggo " : " bork " }
] " #;
// Write the payload into a fresh update file and persist it before registering the addition.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-05-17 20:25:50 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-05-17 20:25:50 +08:00
. unwrap ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_second_task " ) ;
// The deletion should fail because it cannot create an index.
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_failing_the_deletion " ) ;
// The addition should work.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_last_successful_addition " ) ;
// Read every document back from the index, convert each one to JSON, and snapshot the list.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
2022-10-13 16:57:33 +08:00
/// Registers one index creation and one `DocumentClear` per index for three
/// different indexes, then expects `index_names.len() * 2` successful batches,
/// i.e. one batch per registered task.
#[ test ]
fn do_not_batch_task_of_different_indexes ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-13 16:57:33 +08:00
let index_names = [ " doggos " , " cattos " , " girafos " ] ;
// First pass: one index creation per name.
for name in index_names {
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation {
index_uid : name . to_string ( ) ,
primary_key : None ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-13 16:57:33 +08:00
. unwrap ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-13 16:57:33 +08:00
}
// Second pass: one `DocumentClear` per name.
for name in index_names {
index_scheduler
2024-02-21 18:21:26 +08:00
. register (
KindWithContent ::DocumentClear { index_uid : name . to_string ( ) } ,
None ,
false ,
)
2022-10-13 16:57:33 +08:00
. unwrap ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-13 16:57:33 +08:00
}
// Two tasks were registered per index, so that many successful batches are expected.
for _ in 0 .. ( index_names . len ( ) * 2 ) {
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-20 19:11:50 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-13 16:57:33 +08:00
}
2022-10-27 17:17:50 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-13 16:57:33 +08:00
}
2022-10-17 22:30:18 +08:00
/// Creates indexes `a`, `b`, `c` and `d`, then registers and processes three
/// `IndexSwap` tasks: `(a, b)` with `(c, d)`, then `(a, c)`, then one with an
/// empty swap list. The scheduler is snapshotted along the way.
#[ test ]
fn swap_indexes ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-17 22:30:18 +08:00
let to_enqueue = [
index_creation_task ( " a " , " id " ) ,
index_creation_task ( " b " , " id " ) ,
index_creation_task ( " c " , " id " ) ,
index_creation_task ( " d " , " id " ) ,
] ;
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-24 14:12:03 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-17 22:30:18 +08:00
}
2022-11-28 23:27:41 +08:00
// One successful batch per index creation.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " create_a " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " create_b " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " create_c " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " create_d " ) ;
2022-10-17 22:30:18 +08:00
// First swap task: two pairs in a single task.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexSwap {
swaps : vec ! [
IndexSwap { indexes : ( " a " . to_owned ( ) , " b " . to_owned ( ) ) } ,
IndexSwap { indexes : ( " c " . to_owned ( ) , " d " . to_owned ( ) ) } ,
] ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-17 22:30:18 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_swap_registered " ) ;
2022-10-17 22:30:18 +08:00
// Second swap task: a single pair, registered before the first swap is processed.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexSwap {
swaps : vec ! [ IndexSwap { indexes : ( " a " . to_owned ( ) , " c " . to_owned ( ) ) } ] ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-17 22:30:18 +08:00
. unwrap ( ) ;
2022-10-27 15:41:32 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " two_swaps_registered " ) ;
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-27 15:41:32 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_swap_processed " ) ;
2022-10-24 14:12:03 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-17 22:30:18 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " second_swap_processed " ) ;
2022-10-25 16:26:51 +08:00
2024-02-21 18:21:26 +08:00
// Third swap task: an empty swap list, still expected to go through a successful batch.
index_scheduler
. register ( KindWithContent ::IndexSwap { swaps : vec ! [ ] } , None , false )
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-25 16:26:51 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " third_empty_swap_processed " ) ;
2022-10-17 22:30:18 +08:00
}
2022-10-27 15:41:32 +08:00
/// Covers two error paths of `IndexSwap`:
/// 1. a swap list naming the same indexes twice is rejected by `register`
///    itself and leaves the scheduler snapshot unchanged;
/// 2. a swap list naming non-existent indexes is accepted by `register` but
///    its batch is expected to fail.
#[ test ]
fn swap_indexes_errors ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-27 15:41:32 +08:00
let to_enqueue = [
index_creation_task ( " a " , " id " ) ,
index_creation_task ( " b " , " id " ) ,
index_creation_task ( " c " , " id " ) ,
index_creation_task ( " d " , " id " ) ,
] ;
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-27 15:41:32 +08:00
index_scheduler . assert_internally_consistent ( ) ;
}
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 4 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_the_index_creation " ) ;
2022-10-27 15:41:32 +08:00
// NOTE(review): nothing runs between the snapshot above and this one, so both
// capture the same scheduler state under two different names.
let first_snap = snapshot_index_scheduler ( & index_scheduler ) ;
snapshot! ( first_snap , name : " initial_tasks_processed " ) ;
// Registering a swap that mentions `a` and `b` twice must fail immediately.
let err = index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexSwap {
swaps : vec ! [
IndexSwap { indexes : ( " a " . to_owned ( ) , " b " . to_owned ( ) ) } ,
IndexSwap { indexes : ( " b " . to_owned ( ) , " a " . to_owned ( ) ) } ,
] ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-27 15:41:32 +08:00
. unwrap_err ( ) ;
snapshot! ( format! ( " {err} " ) , @ " Indexes must be declared only once during a swap. `a`, `b` were specified several times. " ) ;
// The rejected registration must not have changed the scheduler snapshot.
let second_snap = snapshot_index_scheduler ( & index_scheduler ) ;
assert_eq! ( first_snap , second_snap ) ;
// Indexes `e` and `f` do not exist, but their existence is not checked at registration time.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexSwap {
swaps : vec ! [
IndexSwap { indexes : ( " a " . to_owned ( ) , " b " . to_owned ( ) ) } ,
IndexSwap { indexes : ( " c " . to_owned ( ) , " e " . to_owned ( ) ) } ,
IndexSwap { indexes : ( " d " . to_owned ( ) , " f " . to_owned ( ) ) } ,
] ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-27 15:41:32 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
handle . advance_one_failed_batch ( ) ;
2022-10-27 15:41:32 +08:00
// Now the first swap should have an error message saying `e` and `f` do not exist.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_swap_failed " ) ;
}
2022-10-20 19:18:25 +08:00
/// Registers a document addition (with `allow_index_creation: true`) and an
/// index deletion on an index that was never explicitly created, then advances
/// one successful batch. The scheduler is snapshotted before and after, using
/// unnamed snapshots.
#[ test ]
fn document_addition_and_index_deletion_on_unexisting_index ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-20 19:18:25 +08:00
let content = r #"
{
" id " : 1 ,
" doggo " : " bob "
} " #;
// Write the payload into a fresh update file and persist it before registering the addition.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-20 19:18:25 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-20 19:18:25 +08:00
. unwrap ( ) ;
index_scheduler
2024-02-21 18:21:26 +08:00
. register ( KindWithContent ::IndexDeletion { index_uid : S ( " doggos " ) } , None , false )
2022-10-20 19:18:25 +08:00
. unwrap ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) ) ;
2022-11-28 23:27:41 +08:00
// A single successful batch is advanced for the two registered tasks.
handle . advance_n_successful_batches ( 1 ) ;
2022-10-20 19:18:25 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) ) ;
}
2022-10-25 17:42:14 +08:00
/// Registers a document import (task 0) immediately followed by a
/// `TaskCancelation` targeting task 0, before the scheduler has run at all,
/// then advances one successful batch.
#[ test ]
fn cancel_enqueued_task ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 17:42:14 +08:00
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
file0 . persist ( ) . unwrap ( ) ;
let to_enqueue = [
replace_document_import_task ( " catto " , None , 0 , documents_count0 ) ,
KindWithContent ::TaskCancelation {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 ] ) ,
} ,
] ;
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-25 17:42:14 +08:00
index_scheduler . assert_internally_consistent ( ) ;
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_tasks_enqueued " ) ;
2022-11-28 23:27:41 +08:00
// NOTE(review): presumably the cancelation is the task handled by this batch — confirm against the stored snapshot.
handle . advance_one_successful_batch ( ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_processed " ) ;
}
/// Processes a document import (task 0) in a successful batch, then registers
/// a `TaskCancelation` targeting task 0 and processes it in a second
/// successful batch.
#[ test ]
fn cancel_succeeded_task ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 17:42:14 +08:00
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
file0 . persist ( ) . unwrap ( ) ;
let _ = index_scheduler
2024-02-21 18:21:26 +08:00
. register ( replace_document_import_task ( " catto " , None , 0 , documents_count0 ) , None , false )
2022-10-25 17:42:14 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-25 17:42:14 +08:00
2022-11-28 23:27:41 +08:00
// Let the import run in a successful batch before any cancelation is registered.
handle . advance_one_successful_batch ( ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_task_processed " ) ;
// Ask to cancel task 0 after its batch has already been processed.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskCancelation {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 ] ) ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 17:42:14 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_processed " ) ;
}
/// Advances a document import (task 0) up to `InsideProcessBatch`, registers a
/// `TaskCancelation` targeting it, checks that the scheduler reaches the
/// `AbortedIndexation` step, and finally advances one more successful batch.
#[ test ]
fn cancel_processing_task ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 17:42:14 +08:00
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
file0 . persist ( ) . unwrap ( ) ;
let _ = index_scheduler
2024-02-21 18:21:26 +08:00
. register ( replace_document_import_task ( " catto " , None , 0 , documents_count0 ) , None , false )
2022-10-25 17:42:14 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-25 17:42:14 +08:00
2022-11-28 23:27:41 +08:00
// Stop in the middle of the batch, at the `InsideProcessBatch` step.
handle . advance_till ( [ Start , BatchCreated , InsideProcessBatch ] ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " initial_task_processing " ) ;
// Ask to cancel task 0 while its batch is stopped mid-processing.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskCancelation {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 ] ) ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 17:42:14 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_task_registered " ) ;
2022-10-25 17:42:14 +08:00
// Now we check that we can reach the AbortedIndexation error handling.
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ AbortedIndexation ] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " aborted_indexation " ) ;
2022-10-25 17:42:14 +08:00
2022-11-28 23:27:41 +08:00
// NOTE(review): an earlier, commented-out version stepped explicitly through
// Start, BatchCreated, BeforeProcessing and AfterProcessing here; the helper
// below is used instead.
handle . advance_one_successful_batch ( ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_processed " ) ;
}
/// Registers three document import tasks on three different indexes, processes the
/// first one, pauses inside the next batch and then registers a single cancelation
/// targeting tasks 0, 1 and 2 at once.
#[ test ]
fn cancel_mix_of_tasks ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 17:42:14 +08:00
// One persisted update file per import task.
let ( file0 , documents_count0 ) = sample_documents ( & index_scheduler , 0 , 0 ) ;
file0 . persist ( ) . unwrap ( ) ;
let ( file1 , documents_count1 ) = sample_documents ( & index_scheduler , 1 , 1 ) ;
file1 . persist ( ) . unwrap ( ) ;
let ( file2 , documents_count2 ) = sample_documents ( & index_scheduler , 2 , 2 ) ;
file2 . persist ( ) . unwrap ( ) ;
let to_enqueue = [
replace_document_import_task ( " catto " , None , 0 , documents_count0 ) ,
replace_document_import_task ( " beavero " , None , 1 , documents_count1 ) ,
replace_document_import_task ( " wolfo " , None , 2 , documents_count2 ) ,
] ;
// Check the scheduler's internal consistency after each registration.
for task in to_enqueue {
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( task , None , false ) . unwrap ( ) ;
2022-10-25 17:42:14 +08:00
index_scheduler . assert_internally_consistent ( ) ;
}
2022-11-28 23:27:41 +08:00
// Fully process the first batch.
handle . advance_one_successful_batch ( ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_task_processed " ) ;
2022-11-28 23:27:41 +08:00
// Pause at the `InsideProcessBatch` breakpoint of the following batch.
handle . advance_till ( [ Start , BatchCreated , InsideProcessBatch ] ) ;
2022-10-25 17:42:14 +08:00
// The cancelation targets all three import tasks.
// NOTE(review): at this point they are presumably succeeded (0), processing (1)
// and enqueued (2) respectively -- confirm against the snapshot below.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskCancelation {
query : " test_query " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 , 1 , 2 ] ) ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 17:42:14 +08:00
. unwrap ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processing_second_task_cancel_enqueued " ) ;
2022-11-28 23:27:41 +08:00
// Resume until the `AbortedIndexation` breakpoint is reached.
handle . advance_till ( [ AbortedIndexation ] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " aborted_indexation " ) ;
2022-10-25 17:42:14 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
2022-10-25 17:42:14 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_processed " ) ;
}
2022-10-24 23:29:17 +08:00
/// Registers ten `ReplaceDocuments` tasks on the `doggos` index, processes them in a
/// single batch and snapshots the documents that end up in the index.
#[ test ]
fn test_document_replace ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-24 23:29:17 +08:00
// Each iteration builds a one-document JSON payload, stores it in its own update
// file and registers a document addition task that refers to that file.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
// The value returned by `read_json` is what the task reports as `documents_count`.
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-24 23:29:17 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-24 23:29:17 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-24 23:29:17 +08:00
}
// This snapshot and the one after the batch are unnamed, so their relative order matters.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) ) ;
// everything should be batched together.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 1 ) ;
2022-10-24 23:29:17 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) ) ;
// Has everything been pushed successfully into milli?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
// Convert every stored document back to JSON so the whole index content can be snapshotted.
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-24 23:29:17 +08:00
}
/// Registers ten `UpdateDocuments` tasks on the `doggos` index, processes them in a
/// single batch and snapshots the documents that end up in the index.
#[ test ]
fn test_document_update ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-24 23:29:17 +08:00
// Each iteration builds a one-document JSON payload, stores it in its own update
// file and registers a document update task that refers to that file.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
// The value returned by `read_json` is what the task reports as `documents_count`.
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-24 23:29:17 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : UpdateDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-24 23:29:17 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-24 23:29:17 +08:00
}
// This snapshot and the one after the batch are unnamed, so their relative order matters.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) ) ;
// everything should be batched together.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 1 ) ;
2022-10-24 23:29:17 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) ) ;
// Has everything been pushed successfully into milli?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
// Convert every stored document back to JSON so the whole index content can be snapshotted.
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-24 23:29:17 +08:00
}
/// Registers ten document tasks on the `doggos` index that alternate between
/// `UpdateDocuments` and `ReplaceDocuments`, processes them in two rounds of five
/// batches and snapshots the documents that end up in the index.
#[ test ]
fn test_mixed_document_addition ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-24 23:29:17 +08:00
for i in 0 .. 10 {
// Even iterations register an update, odd iterations a replacement.
let method = if i % 2 = = 0 { UpdateDocuments } else { ReplaceDocuments } ;
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
// The value returned by `read_json` is what the task reports as `documents_count`.
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-24 23:29:17 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-24 23:29:17 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-24 23:29:17 +08:00
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-24 23:29:17 +08:00
// Only half of the task should've been processed since we can't autobatch replace and update together.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " five_tasks_processed " ) ;
2022-10-24 23:29:17 +08:00
2022-11-28 23:27:41 +08:00
// Five more batches are expected to drain the queue.
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-24 23:29:17 +08:00
// Has everything been pushed successfully into milli?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
// Convert every stored document back to JSON so the whole index content can be snapshotted.
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-24 23:29:17 +08:00
}
2024-03-26 17:36:56 +08:00
/// Registers a settings update that configures a REST embedder named `default` on
/// the `doggos` index, snapshots the task details before and after processing, and
/// finally snapshots the embedding configuration stored in the index.
#[ test ]
fn test_settings_update ( ) {
use meilisearch_types ::settings ::{ Settings , Unchecked } ;
use milli ::update ::Setting ;
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
// Start from default settings and only set the `embedders` field.
let mut new_settings : Box < Settings < Unchecked > > = Box ::default ( ) ;
let mut embedders = BTreeMap ::default ( ) ;
let embedding_settings = milli ::vector ::settings ::EmbeddingSettings {
source : Setting ::Set ( milli ::vector ::settings ::EmbedderSource ::Rest ) ,
api_key : Setting ::Set ( S ( " My super secret " ) ) ,
url : Setting ::Set ( S ( " http://localhost:7777 " ) ) ,
2024-04-04 17:01:50 +08:00
dimensions : Setting ::Set ( 4 ) ,
2024-03-26 17:36:56 +08:00
.. Default ::default ( )
} ;
embedders . insert ( S ( " default " ) , Setting ::Set ( embedding_settings ) ) ;
new_settings . embedders = Setting ::Set ( embedders ) ;
index_scheduler
. register (
KindWithContent ::SettingsUpdate {
index_uid : S ( " doggos " ) ,
new_settings ,
is_deletion : false ,
allow_index_creation : true ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_settings_task " ) ;
// Snapshot the details of task 0 as exposed through `TaskView` while it is still enqueued.
// The read transaction is scoped so that it is dropped before the batch is processed.
{
let rtxn = index_scheduler . read_txn ( ) . unwrap ( ) ;
let task = index_scheduler . get_task ( & rtxn , 0 ) . unwrap ( ) . unwrap ( ) ;
let task = meilisearch_types ::task_view ::TaskView ::from_task ( & task ) ;
insta ::assert_json_snapshot! ( task . details ) ;
}
handle . advance_n_successful_batches ( 1 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " settings_update_processed " ) ;
// Snapshot the details of the same task again, now that the batch has been processed.
{
let rtxn = index_scheduler . read_txn ( ) . unwrap ( ) ;
let task = index_scheduler . get_task ( & rtxn , 0 ) . unwrap ( ) . unwrap ( ) ;
let task = meilisearch_types ::task_view ::TaskView ::from_task ( & task ) ;
insta ::assert_json_snapshot! ( task . details ) ;
}
// Has everything been pushed successfully into milli?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let configs = index . embedding_configs ( & rtxn ) . unwrap ( ) ;
2024-05-22 21:27:09 +08:00
// Only the first stored configuration is inspected: its name, the bitmap bound to
// `user_provided` (expected to be empty) and its embedder options.
let ( name , embedding_config , user_provided ) = configs . first ( ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " default " ) ;
insta ::assert_debug_snapshot! ( user_provided , @ " RoaringBitmap<[]> " ) ;
2024-03-26 17:36:56 +08:00
insta ::assert_json_snapshot! ( embedding_config . embedder_options ) ;
}
2022-10-24 23:29:17 +08:00
/// Same scenario as the batched replace test, but the scheduler is created with
/// `false` as first argument: ten `ReplaceDocuments` tasks are registered and then
/// processed in two rounds of five batches.
#[ test ]
fn test_document_replace_without_autobatching ( ) {
2022-11-28 23:27:41 +08:00
// NOTE(review): `false` presumably disables autobatching, as the test name suggests -- confirm.
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( false , vec! [ ] ) ;
2022-10-24 23:29:17 +08:00
// Each iteration builds a one-document JSON payload, stores it in its own update
// file and registers a document addition task that refers to that file.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
// The value returned by `read_json` is what the task reports as `documents_count`.
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-24 23:29:17 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-24 23:29:17 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-24 23:29:17 +08:00
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-24 23:29:17 +08:00
// Nothing should be batched thus half of the tasks are processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " five_tasks_processed " ) ;
2022-10-24 23:29:17 +08:00
// Everything is processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-24 23:29:17 +08:00
// Has everything been pushed successfully into milli?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
// Convert every stored document back to JSON so the whole index content can be snapshotted.
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-24 23:29:17 +08:00
}
/// Same scenario as the batched update test, but the scheduler is created with
/// `false` as first argument: ten `UpdateDocuments` tasks are registered and then
/// processed in two rounds of five batches.
#[ test ]
fn test_document_update_without_autobatching ( ) {
2022-11-28 23:27:41 +08:00
// NOTE(review): `false` presumably disables autobatching, as the test name suggests -- confirm.
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( false , vec! [ ] ) ;
2022-10-24 23:29:17 +08:00
// Each iteration builds a one-document JSON payload, stores it in its own update
// file and registers a document update task that refers to that file.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
// The value returned by `read_json` is what the task reports as `documents_count`.
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-24 23:29:17 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : UpdateDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-24 23:29:17 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-24 23:29:17 +08:00
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-24 23:29:17 +08:00
// Nothing should be batched thus half of the tasks are processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " five_tasks_processed " ) ;
2022-10-24 23:29:17 +08:00
// Everything is processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-24 23:29:17 +08:00
// Has everything been pushed successfully into milli?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
// Convert every stored document back to JSON so the whole index content can be snapshotted.
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
2022-10-25 18:30:56 +08:00
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-25 18:30:56 +08:00
}
2022-10-25 21:30:36 +08:00
/// Formats `$value` with its `Debug` implementation and compares the resulting
/// string against the inline snapshot literal through `meili_snap::snapshot!`.
///
/// Only the inline form (`value, @"expected"`) is supported.
#[ macro_export ]
macro_rules ! debug_snapshot {
( $value :expr , @ $snapshot :literal ) = > { {
// The value is rendered to a `String` first; the snapshot is taken of that string.
let value = format! ( " {:?} " , $value ) ;
meili_snap ::snapshot! ( value , @ $snapshot ) ;
} } ;
}
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
/// Smoke test: building a test scheduler with no planned failures must not panic.
#[ test ]
fn simple_new ( ) {
// The returned value is dropped immediately; only the construction is exercised.
crate ::IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 18:30:56 +08:00
}
2022-10-26 18:56:01 +08:00
/// Checks how the `from` and `limit` fields of `Query` restrict the task ids returned
/// by `get_task_ids_from_authorized_indexes`, using three processed index creation
/// tasks (ids 0, 1 and 2).
///
/// As the inline snapshots show, `limit` keeps the highest matching ids and `from`
/// acts as an inclusive upper bound on the task id.
#[ test ]
fn query_tasks_from_and_limit ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-26 18:56:01 +08:00
// Register three index creation tasks, snapshotting the scheduler after each one.
let kind = index_creation_task ( " doggo " , " bone " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-26 18:56:01 +08:00
let kind = index_creation_task ( " whalo " , " plankton " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_second_task " ) ;
2022-10-26 18:56:01 +08:00
let kind = index_creation_task ( " catto " , " his_own_vomit " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_third_task " ) ;
2022-10-26 18:56:01 +08:00
2022-11-28 23:27:41 +08:00
// The three tasks are processed as three separate batches.
handle . advance_n_successful_batches ( 3 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_all_tasks " ) ;
2022-10-26 18:56:01 +08:00
2022-10-27 17:17:50 +08:00
// A single read transaction is shared by all the queries below.
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
2022-10-26 18:56:01 +08:00
// A limit of zero returns no task at all.
let query = Query { limit : Some ( 0 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
// A limit of one keeps only the highest task id.
let query = Query { limit : Some ( 1 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [2,] " ) ;
// A limit of two keeps the two highest task ids.
let query = Query { limit : Some ( 2 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [1,2,] " ) ;
// `from: 1` excludes task 2: only ids up to and including 1 are returned.
let query = Query { from : Some ( 1 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,] " ) ;
// `from: 2` is the highest existing id, so every task is returned.
let query = Query { from : Some ( 2 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,2,] " ) ;
// Combining both: start at id 1 and keep a single task.
let query = Query { from : Some ( 1 ) , limit : Some ( 1 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [1,] " ) ;
// Combining both: start at id 1 and keep two tasks.
let query = Query { from : Some ( 1 ) , limit : Some ( 2 ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-26 18:56:01 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,] " ) ;
}
2022-10-25 18:30:56 +08:00
#[ test ]
2022-10-27 17:17:50 +08:00
fn query_tasks_simple ( ) {
2022-10-25 21:30:36 +08:00
let start_time = OffsetDateTime ::now_utc ( ) ;
2022-10-25 18:30:56 +08:00
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) =
2022-10-25 21:30:36 +08:00
IndexScheduler ::test ( true , vec! [ ( 3 , FailureLocation ::InsideProcessBatch ) ] ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let kind = index_creation_task ( " catto " , " mouse " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
let kind = index_creation_task ( " doggo " , " sheep " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
let kind = index_creation_task ( " whalo " , " fish " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " start " ) ;
2022-10-25 18:30:56 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ Start , BatchCreated ] ) ;
2022-10-25 18:30:56 +08:00
2022-10-27 17:17:50 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
let query = Query { statuses : Some ( vec! [ Status ::Processing ] ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,] " ) ; // only the processing tasks in the first tick
2022-10-25 18:30:56 +08:00
2022-11-28 23:27:41 +08:00
let query = Query { statuses : Some ( vec! [ Status ::Enqueued ] ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [1,2,] " ) ; // only the enqueued tasks in the first tick
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Enqueued , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,2,] " ) ; // both enqueued and processing tasks in the first tick
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Enqueued , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( start_time ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// both enqueued and processing tasks in the first tick, but limited to those with a started_at
// that comes after the start of the test, which should exclude the enqueued tasks
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,] " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Enqueued , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
before_started_at : Some ( start_time ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// both enqueued and processing tasks in the first tick, but limited to those with a started_at
// that comes before the start of the test, which should exclude all of them
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Enqueued , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( start_time ) ,
before_started_at : Some ( start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// both enqueued and processing tasks in the first tick, but limited to those with a started_at
// that comes after the start of the test and before one minute after the start of the test,
// which should exclude the enqueued tasks and include the only processing task
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,] " ) ;
2022-10-25 18:30:56 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_till ( [
InsideProcessBatch ,
InsideProcessBatch ,
ProcessBatchSucceeded ,
AfterProcessing ,
Start ,
BatchCreated ,
] ) ;
2022-10-25 18:30:56 +08:00
2022-10-27 17:17:50 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
let second_start_time = OffsetDateTime ::now_utc ( ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Succeeded , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( start_time ) ,
before_started_at : Some ( start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// both succeeded and processing tasks in the first tick, but limited to those with a started_at
// that comes after the start of the test and before one minute after the start of the test,
// which should include all tasks
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,] " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Succeeded , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
before_started_at : Some ( start_time ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// both succeeded and processing tasks in the first tick, but limited to those with a started_at
// that comes before the start of the test, which should exclude all tasks
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Enqueued , Status ::Succeeded , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( second_start_time ) ,
before_started_at : Some ( second_start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// both succeeded and processing tasks in the first tick, but limited to those with a started_at
// that comes after the start of the second part of the test and before one minute after the
// second start of the test, which should exclude all tasks
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// now we make one more batch, the started_at field of the new tasks will be past `second_start_time`
2022-11-28 23:27:41 +08:00
handle . advance_till ( [
InsideProcessBatch ,
InsideProcessBatch ,
ProcessBatchSucceeded ,
AfterProcessing ,
Start ,
BatchCreated ,
] ) ;
2022-10-27 17:17:50 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// we run the same query to verify that, and indeed find that the last task is matched
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [2,] " ) ;
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Enqueued , Status ::Succeeded , Status ::Processing ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( second_start_time ) ,
before_started_at : Some ( second_start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// enqueued, succeeded, or processing tasks started after the second part of the test, should
// again only return the last task
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [2,] " ) ;
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ ProcessBatchFailed , AfterProcessing ] ) ;
2022-10-27 17:17:50 +08:00
let rtxn = index_scheduler . read_txn ( ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// now the last task should have failed
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " end " ) ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// so running the last query should return nothing
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Failed ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( second_start_time ) ,
before_started_at : Some ( second_start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// but the same query on failed tasks should return the last task
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [2,] " ) ;
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Failed ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( second_start_time ) ,
before_started_at : Some ( second_start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// but the same query on failed tasks should return the last task
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [2,] " ) ;
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Failed ] ) ,
uids : Some ( vec! [ 1 ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( second_start_time ) ,
before_started_at : Some ( second_start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// same query but with an invalid uid
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
let query = Query {
2022-11-28 23:27:41 +08:00
statuses : Some ( vec! [ Status ::Failed ] ) ,
uids : Some ( vec! [ 2 ] ) ,
2022-10-25 21:30:36 +08:00
after_started_at : Some ( second_start_time ) ,
before_started_at : Some ( second_start_time + Duration ::minutes ( 1 ) ) ,
.. Default ::default ( )
} ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
// same query but with a valid uid
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [2,] " ) ;
}
2022-10-25 18:30:56 +08:00
2022-10-27 17:17:50 +08:00
/// Checks how the `index_uids` query filter and the authorized-indexes filter combine,
/// and in particular what happens to index swap tasks under each of them.
///
/// Four tasks are registered: creation of `catto` (0), creation of `doggo` (1),
/// a swap of `catto` with `doggo` (2) and a swap of `catto` with `whalo` (3).
#[ test ]
fn query_tasks_special_rules ( ) {
2022-11-28 23:27:41 +08:00
// NOTE(review): a failure is planned via `(3, FailureLocation::InsideProcessBatch)`, but this test
// only advances up to the first `BatchCreated`; confirm the exact meaning of the `3` against
// `IndexScheduler::test`.
let ( index_scheduler , mut handle ) =
2022-10-27 17:17:50 +08:00
IndexScheduler ::test ( true , vec! [ ( 3 , FailureLocation ::InsideProcessBatch ) ] ) ;
let kind = index_creation_task ( " catto " , " mouse " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-27 17:17:50 +08:00
let kind = index_creation_task ( " doggo " , " sheep " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-27 17:17:50 +08:00
let kind = KindWithContent ::IndexSwap {
swaps : vec ! [ IndexSwap { indexes : ( " catto " . to_owned ( ) , " doggo " . to_owned ( ) ) } ] ,
} ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-27 17:17:50 +08:00
let kind = KindWithContent ::IndexSwap {
swaps : vec ! [ IndexSwap { indexes : ( " catto " . to_owned ( ) , " whalo " . to_owned ( ) ) } ] ,
} ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-27 17:17:50 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " start " ) ;
2022-11-28 23:27:41 +08:00
// Only wait until the first batch has been created; every query below runs in that state.
handle . advance_till ( [ Start , BatchCreated ] ) ;
2022-10-27 17:17:50 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
let query = Query { index_uids : Some ( vec! [ " catto " . to_owned ( ) ] ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-27 17:17:50 +08:00
// only the first task associated with catto is returned, the indexSwap tasks are excluded!
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,] " ) ;
2022-11-28 23:27:41 +08:00
let query = Query { index_uids : Some ( vec! [ " catto " . to_owned ( ) ] ) , .. Default ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-02 01:21:45 +08:00
. get_task_ids_from_authorized_indexes (
& rtxn ,
& query ,
2023-02-19 21:40:25 +08:00
& AuthFilter ::with_allowed_indexes (
vec! [ IndexUidPattern ::new_unchecked ( " doggo " ) ] . into_iter ( ) . collect ( ) ,
) ,
2023-02-02 01:21:45 +08:00
)
2022-10-27 17:17:50 +08:00
. unwrap ( ) ;
// we have asked for only the tasks associated with catto, but are only authorized to retrieve the tasks
// associated with doggo -> empty result
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [] " ) ;
let query = Query ::default ( ) ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-02 01:21:45 +08:00
. get_task_ids_from_authorized_indexes (
& rtxn ,
& query ,
2023-02-19 21:40:25 +08:00
& AuthFilter ::with_allowed_indexes (
vec! [ IndexUidPattern ::new_unchecked ( " doggo " ) ] . into_iter ( ) . collect ( ) ,
) ,
2023-02-02 01:21:45 +08:00
)
2022-10-27 17:17:50 +08:00
. unwrap ( ) ;
// we asked for all the tasks, but we are only authorized to retrieve the doggo tasks
// -> only the index creation of doggo should be returned
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [1,] " ) ;
let query = Query ::default ( ) ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2022-10-27 17:17:50 +08:00
. get_task_ids_from_authorized_indexes (
& rtxn ,
& query ,
2023-02-19 21:40:25 +08:00
& AuthFilter ::with_allowed_indexes (
vec! [
IndexUidPattern ::new_unchecked ( " catto " ) ,
IndexUidPattern ::new_unchecked ( " doggo " ) ,
]
. into_iter ( )
. collect ( ) ,
) ,
2022-10-27 17:17:50 +08:00
)
. unwrap ( ) ;
// we asked for all the tasks, but we are only authorized to retrieve the doggo and catto tasks
// -> only the two index creations are returned: the expected bitmap contains neither of the
// two swap tasks (2 and 3), even though both indexes of swap 2 are authorized
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,] " ) ;
2022-10-27 17:17:50 +08:00
let query = Query ::default ( ) ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-10-27 17:17:50 +08:00
// we asked for all the tasks with all indexes authorized -> all tasks returned
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [0,1,2,3,] " ) ;
}
2022-10-25 18:30:56 +08:00
/// Checks the `canceled_by` query filter, first with every index authorized and then
/// with only `doggo` authorized.
#[ test ]
2022-11-28 23:27:41 +08:00
fn query_tasks_canceled_by ( ) {
let ( index_scheduler , mut handle ) =
IndexScheduler ::test ( true , vec! [ ( 3 , FailureLocation ::InsideProcessBatch ) ] ) ;
2022-10-25 18:30:56 +08:00
2022-11-28 23:27:41 +08:00
// Tasks 0 and 1: index creations; task 2: swap of catto with doggo.
let kind = index_creation_task ( " catto " , " mouse " ) ;
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
let kind = index_creation_task ( " doggo " , " sheep " ) ;
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
let kind = KindWithContent ::IndexSwap {
swaps : vec ! [ IndexSwap { indexes : ( " catto " . to_owned ( ) , " doggo " . to_owned ( ) ) } ] ,
} ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
// Let one batch succeed before the cancelation is registered; as noted below, task 0 ends up
// not being canceled.
handle . advance_n_successful_batches ( 1 ) ;
// The cancelation targets uids 0 to 3; 3 is the uid the cancelation task itself receives
// (see the comment on the first assertion below).
let kind = KindWithContent ::TaskCancelation {
query : " test_query " . to_string ( ) ,
tasks : [ 0 , 1 , 2 , 3 ] . into_iter ( ) . collect ( ) ,
} ;
2024-02-21 18:21:26 +08:00
let task_cancelation = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 1 ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " start " ) ;
let rtxn = index_scheduler . read_txn ( ) . unwrap ( ) ;
let query = Query { canceled_by : Some ( vec! [ task_cancelation . uid ] ) , .. Query ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-19 21:40:25 +08:00
. get_task_ids_from_authorized_indexes ( & rtxn , & query , & AuthFilter ::default ( ) )
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
// 0 is not returned because it was not canceled, 3 is not returned because it is the uid of the
// taskCancelation itself
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [1,2,] " ) ;
// Same query again, this time restricted to the tasks of the `doggo` index.
let query = Query { canceled_by : Some ( vec! [ task_cancelation . uid ] ) , .. Query ::default ( ) } ;
2023-07-05 17:00:40 +08:00
let ( tasks , _ ) = index_scheduler
2023-02-02 01:21:45 +08:00
. get_task_ids_from_authorized_indexes (
& rtxn ,
& query ,
2023-02-19 21:40:25 +08:00
& AuthFilter ::with_allowed_indexes (
vec! [ IndexUidPattern ::new_unchecked ( " doggo " ) ] . into_iter ( ) . collect ( ) ,
) ,
2023-02-02 01:21:45 +08:00
)
2022-11-28 23:27:41 +08:00
. unwrap ( ) ;
// Return only 1 because the user is not authorized to see task 2
snapshot! ( snapshot_bitmap ( & tasks ) , @ " [1,] " ) ;
2022-10-25 21:30:36 +08:00
}
/// Registers a single index creation task while a `FailureLocation::InsideProcessBatch`
/// failure is planned, and expects the resulting batch to fail.
#[ test ]
fn fail_in_process_batch_for_index_creation ( ) {
2022-11-28 23:27:41 +08:00
// NOTE(review): the `1` in the planned failure is presumably the run-loop iteration at which
// the failure is injected (it matches the assertion below) — confirm against `IndexScheduler::test`.
let ( index_scheduler , mut handle ) =
2022-10-25 21:30:36 +08:00
IndexScheduler ::test ( true , vec! [ ( 1 , FailureLocation ::InsideProcessBatch ) ] ) ;
let kind = index_creation_task ( " catto " , " mouse " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_register " ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_one_failed_batch ( ) ;
2022-10-25 21:30:36 +08:00
// Still in the first iteration
assert_eq! ( * index_scheduler . run_loop_iteration . read ( ) . unwrap ( ) , 1 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " index_creation_failed " ) ;
}
/// Registers a single document addition while a `FailureLocation::InsideProcessBatch`
/// failure is planned, and snapshots the scheduler after registration, after the batch
/// creation and after the batch failure.
#[ test ]
fn fail_in_process_batch_for_document_addition ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) =
2022-10-25 21:30:36 +08:00
IndexScheduler ::test ( true , vec! [ ( 1 , FailureLocation ::InsideProcessBatch ) ] ) ;
let content = r #"
{
" id " : 1 ,
" doggo " : " bob "
} " #;
// Feed the JSON payload into a new update file; the value returned by `read_json` is
// forwarded as the `documents_count` of the task.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
file . persist ( ) . unwrap ( ) ;
2022-10-25 18:30:56 +08:00
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 18:30:56 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
// Stop once the batch exists so its state can be snapshotted before it gets processed.
handle . advance_till ( [ Start , BatchCreated ] ) ;
2022-10-25 21:30:36 +08:00
snapshot! (
snapshot_index_scheduler ( & index_scheduler ) ,
name : " document_addition_batch_created "
) ;
2022-11-28 23:27:41 +08:00
// The batch is expected to fail.
handle . advance_till ( [ ProcessBatchFailed , AfterProcessing ] ) ;
2022-10-25 21:30:36 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " document_addition_failed " ) ;
}
/// Registers a single document addition while a
/// `FailureLocation::UpdatingTaskAfterProcessBatchSuccess` failure is planned for task 0.
/// The batch itself succeeds, the scheduler then hits the planned failure and, after a
/// delay of at least one second, the test expects the task to be processed successfully.
#[ test ]
fn fail_in_update_task_after_process_batch_success_for_document_addition ( ) {
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test (
2022-10-25 21:30:36 +08:00
true ,
vec! [ ( 1 , FailureLocation ::UpdatingTaskAfterProcessBatchSuccess { task_uid : 0 } ) ] ,
) ;
let content = r #"
{
" id " : 1 ,
" doggo " : " bob "
} " #;
// Feed the JSON payload into a new update file; the value returned by `read_json` is
// forwarded as the `documents_count` of the task.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 21:30:36 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ Start ] ) ;
// NOTE(review): this snapshot is taken right after reaching `Start`, before any batch event
// has been awaited in this test; its name may be inherited from an earlier version — confirm.
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " document_addition_succeeded_but_index_scheduler_not_updated " ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
handle . advance_till ( [ BatchCreated , InsideProcessBatch , ProcessBatchSucceeded ] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_batch_succeeded " ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
// At this point the next time the scheduler will try to progress it should encounter
// a critical failure and have to wait for 1s before retrying anything.
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
let before_failure = Instant ::now ( ) ;
handle . advance_till ( [ Start ] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_failing_to_commit " ) ;
let failure_duration = before_failure . elapsed ( ) ;
// Reaching the next `Start` must have taken at least the 1s wait described above.
assert! ( failure_duration . as_millis ( ) > = 1000 ) ;
2022-10-25 21:30:36 +08:00
2022-11-28 23:27:41 +08:00
// The retry is expected to go through, all the way to `AfterProcessing`.
handle . advance_till ( [
BatchCreated ,
InsideProcessBatch ,
ProcessBatchSucceeded ,
AfterProcessing ,
] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " task_successfully_processed " ) ;
2022-10-25 21:30:36 +08:00
}
#[ test ]
fn test_document_addition_cant_create_index_without_index ( ) {
// We're going to autobatch multiple document additions that don't have
// the right to create an index while there is no index currently.
// Thus, everything should be batched together and the single batch
// should fail, leaving the index nonexistent.
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 18:30:56 +08:00
// Register ten single-document additions, none of which may create the index.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 18:30:56 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : false ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 18:30:56 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-25 18:30:56 +08:00
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// Everything should be batched together, and that one batch is expected to fail.
2022-11-28 23:27:41 +08:00
handle . advance_till ( [
Start ,
BatchCreated ,
InsideProcessBatch ,
ProcessBatchFailed ,
AfterProcessing ,
] ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_processing_the_10_tasks " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// The index should not exist.
snapshot! ( format! ( " {} " , index_scheduler . index ( " doggos " ) . map ( | _ | ( ) ) . unwrap_err ( ) ) , @ " Index `doggos` not found. " ) ;
2022-10-25 18:30:56 +08:00
}
#[ test ]
2022-10-25 21:30:36 +08:00
fn test_document_addition_cant_create_index_without_index_without_autobatching ( ) {
// We're going to execute multiple document additions that don't have
// the right to create an index while there is no index currently.
// Since the autobatching is disabled, every task should be processed
// sequentially, and each of the ten batches is expected to fail.
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( false , vec! [ ] ) ;
2022-10-25 18:30:56 +08:00
// Register ten single-document additions, none of which may create the index.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 18:30:56 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : false ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 18:30:56 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-25 18:30:56 +08:00
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// Nothing should be batched thus half of the tasks are processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_failed_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " five_tasks_processed " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// Everything is processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_failed_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-25 21:30:36 +08:00
// The index should not exist.
snapshot! ( format! ( " {} " , index_scheduler . index ( " doggos " ) . map ( | _ | ( ) ) . unwrap_err ( ) ) , @ " Index `doggos` not found. " ) ;
2022-10-25 18:30:56 +08:00
}
#[ test ]
2022-10-25 21:30:36 +08:00
fn test_document_addition_cant_create_index_with_index ( ) {
// We're going to autobatch multiple document additions that don't have
// the right to create an index while there is already an index.
// Thus, everything should be batched together and no error should be
// thrown.
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// Create the index.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 21:30:36 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_the_first_task " ) ;
2022-10-25 21:30:36 +08:00
2022-10-25 18:30:56 +08:00
// Register ten single-document additions, none of which may create the index.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 18:30:56 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : false ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 18:30:56 +08:00
. unwrap ( ) ;
2022-10-25 21:30:36 +08:00
index_scheduler . assert_internally_consistent ( ) ;
2022-10-25 18:30:56 +08:00
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-25 18:30:56 +08:00
2022-10-25 21:30:36 +08:00
// Everything should be batched together.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 1 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_processing_the_10_tasks " ) ;
2022-10-25 18:30:56 +08:00
// Has everything been pushed successfully in milli?
// Dump every document of the index as JSON and compare it against the stored snapshot.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
2022-10-24 23:29:17 +08:00
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-24 23:29:17 +08:00
}
2022-09-21 18:01:46 +08:00
#[ test ]
2022-10-25 21:30:36 +08:00
fn test_document_addition_cant_create_index_with_index_without_autobatching ( ) {
// We're going to execute multiple document additions that don't have
// the right to create an index while the index already exists.
// Since the autobatching is disabled, every task should be processed
// sequentially, and every batch is expected to succeed.
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( false , vec! [ ] ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Create the index.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 21:30:36 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_the_first_task " ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Register ten single-document additions, none of which may create the index.
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : false ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 21:30:36 +08:00
. unwrap ( ) ;
2022-10-20 23:11:44 +08:00
index_scheduler . assert_internally_consistent ( ) ;
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Nothing should be batched thus half of the tasks are processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " five_tasks_processed " ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Everything is processed.
2022-11-28 23:27:41 +08:00
handle . advance_n_successful_batches ( 5 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Has everything been pushed successfully in milli?
// Dump every document of the index as JSON and compare it against the stored snapshot.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-20 23:11:44 +08:00
}
#[ test ]
2022-10-25 21:30:36 +08:00
fn test_document_addition_mixed_rights_with_index ( ) {
// We're going to autobatch multiple document additions.
// - The index already exists
// - The first document addition doesn't have the right to create an index;
// can it batch with the other ones? The test expects a single successful batch.
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Create the index.
2022-10-20 23:11:44 +08:00
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggos " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-20 23:11:44 +08:00
. unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " processed_the_first_task " ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
2022-10-26 21:14:46 +08:00
// Alternate the rights: only the additions with an odd `i` may create the index.
let allow_index_creation = i % 2 ! = 0 ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 21:30:36 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-25 21:30:36 +08:00
// Everything should be batched together.
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-25 21:30:36 +08:00
// Has everything been pushed successfully in milli?
// Dump every document of the index as JSON and compare it against the stored snapshot.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-20 23:11:44 +08:00
}
#[ test ]
2022-10-25 21:30:36 +08:00
fn test_document_addition_mixed_right_without_index_starts_with_cant_create ( ) {
// We're going to autobatch multiple document additions.
// - The index does not exist
// - The first document addition doesn't have the right to create an index
// - The second does. They should not batch together.
// - The second should batch with everything else as it's going to create an index.
2022-11-28 23:27:41 +08:00
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
for i in 0 .. 10 {
let content = format! (
r #" {{
" id " : { } ,
" doggo " : " bob {} "
} } " #,
i , i
) ;
2022-10-26 21:14:46 +08:00
// Alternate the rights: only the additions with an odd `i` may create the index,
// so the very first one (i = 0) may not.
let allow_index_creation = i % 2 ! = 0 ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( i ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2022-10-25 21:30:36 +08:00
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2022-10-25 21:30:36 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_10_tasks " ) ;
2022-10-24 20:16:14 +08:00
2022-10-25 21:30:36 +08:00
// A first batch should be processed with only the first documentAddition that's going to fail.
2022-11-28 23:27:41 +08:00
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " only_first_task_failed " ) ;
2022-10-20 23:11:44 +08:00
2022-10-25 21:30:36 +08:00
// Everything else should be batched together.
2022-11-28 23:27:41 +08:00
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_tasks_processed " ) ;
2022-10-25 21:30:36 +08:00
// Has everything been pushed successfully in milli?
// Dump every document of the index as JSON and compare it against the stored snapshot.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
2023-01-24 00:32:13 +08:00
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
#[ test ]
// Registers three single-document additions on the `doggos` index, each one declaring a
// different primary key (`id`, `bork`, `bloup`), then steps the scheduler batch by batch.
// Expected outcome: one successful batch followed by two failed batches, with the index
// primary key still reported as `id` at the end.
fn test_document_addition_with_multiple_primary_key ( ) {
// NOTE(review): the arguments presumably mean "autobatching enabled" and "no planned failures" — confirm against `IndexScheduler::test`.
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
for ( id , primary_key ) in [ " id " , " bork " , " bloup " ] . iter ( ) . enumerate ( ) {
// Each payload is a single JSON document whose `id` field is the loop index.
let content = format! (
r #" {{
" id " : { id } ,
" doggo " : " jean bob "
} } " #,
) ;
// The loop index is also passed as the UUID of the update file.
let ( uuid , mut file ) =
index_scheduler . create_update_file_with_uuid ( id as u128 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-01-24 00:32:13 +08:00
assert_eq! ( documents_count , 1 ) ;
// The update file is persisted before the task referencing it is registered.
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( primary_key ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-01-24 00:32:13 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_3_tasks " ) ;
// A first batch should be processed with only the first documentAddition.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " only_first_task_succeed " ) ;
// The second batch should fail.
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " second_task_fails " ) ;
// The third batch should fail as well.
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " third_task_fails " ) ;
// Is the primary key still what we expect?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " id " ) ;
// Is the document still the one we expect?
// Every stored document is converted back to JSON and compared against the `documents` snapshot.
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
#[ test ]
// Registers three single-document additions on the `doggos` index with the primary keys
// `id`, `bork` and `bork`. The first batch is expected to succeed and the following batch
// to fail; the index primary key must still be `id` afterwards.
fn test_document_addition_with_multiple_primary_key_batch_wrong_key ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
for ( id , primary_key ) in [ " id " , " bork " , " bork " ] . iter ( ) . enumerate ( ) {
// Each payload is a single JSON document whose `id` field is the loop index.
let content = format! (
r #" {{
" id " : { id } ,
" doggo " : " jean bob "
} } " #,
) ;
// The loop index is also passed as the UUID of the update file.
let ( uuid , mut file ) =
index_scheduler . create_update_file_with_uuid ( id as u128 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-01-24 00:32:13 +08:00
assert_eq! ( documents_count , 1 ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( primary_key ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-01-24 00:32:13 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_3_tasks " ) ;
// A first batch should be processed with only the first documentAddition.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " only_first_task_succeed " ) ;
// The second batch should fail and contain the two remaining tasks.
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " second_and_third_tasks_fails " ) ;
// Is the primary key still what we expect?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " id " ) ;
// Is the document still the one we expect?
// Every stored document is converted back to JSON and compared against the `documents` snapshot.
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
#[ test ]
// Registers five single-document additions on the `doggos` index with the primary keys
// `bork`, `bork`, `id`, `bork`, `id`. The documents only carry the fields `id` and `doggo`,
// so `bork` never matches a field. The test steps through four batches and checks the
// index primary key after the relevant ones.
fn test_document_addition_with_bad_primary_key ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
for ( id , primary_key ) in [ " bork " , " bork " , " id " , " bork " , " id " ] . iter ( ) . enumerate ( ) {
// Each payload is a single JSON document whose `id` field is the loop index.
let content = format! (
r #" {{
" id " : { id } ,
" doggo " : " jean bob "
} } " #,
) ;
// The loop index is also passed as the UUID of the update file.
let ( uuid , mut file ) =
index_scheduler . create_update_file_with_uuid ( id as u128 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-01-24 00:32:13 +08:00
assert_eq! ( documents_count , 1 ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( primary_key ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-01-24 00:32:13 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_5_tasks " ) ;
// A first batch should be processed with only the first two documentAdditions.
// Both tasks should fail because the documents don't contain any `bork` field.
// NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_and_second_task_fails " ) ;
// No primary key should be set on the index since both tasks of the batch failed.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) ;
snapshot! ( primary_key . is_none ( ) , @ " true " ) ;
// The second batch should succeed and only contain one task.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " third_task_succeeds " ) ;
// The primary key should be set to `id` since this batch succeeded.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " id " ) ;
// We're trying to `bork` again, but now there is already a primary key set for this index.
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " fourth_task_fails " ) ;
// Finally the last task should succeed since its primary key is the same as the valid one.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " fifth_task_succeeds " ) ;
// Is the primary key still what we expect?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " id " ) ;
// Is the document still the one we expect?
// Every stored document is converted back to JSON and compared against the `documents` snapshot.
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
#[ test ]
// Registers six single-document additions on the `doggos` index, mixing tasks without a
// primary key (`None`) and tasks with an explicit one (`bork`, `paw`). The documents only
// carry the fields `paw` and `doggo`. The test steps through four batches and checks
// that the primary key ends up being `paw`.
fn test_document_addition_with_set_and_null_primary_key ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
for ( id , primary_key ) in
[ None , Some ( " bork " ) , Some ( " paw " ) , None , None , Some ( " paw " ) ] . into_iter ( ) . enumerate ( )
{
// Each payload is a single JSON document whose `paw` field is the loop index.
let content = format! (
r #" {{
" paw " : { id } ,
" doggo " : " jean bob "
} } " #,
) ;
// The loop index is also passed as the UUID of the update file.
let ( uuid , mut file ) =
index_scheduler . create_update_file_with_uuid ( id as u128 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-01-24 00:32:13 +08:00
assert_eq! ( documents_count , 1 ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : primary_key . map ( | pk | pk . to_string ( ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-01-24 00:32:13 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_6_tasks " ) ;
// The first batch should contain only one task that fails because we can't infer the primary key.
// NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_task_fails " ) ;
// The second batch should contain only one task that fails because `bork` is not a valid primary key.
// NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " second_task_fails " ) ;
// No primary key should be set at this point.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) ;
snapshot! ( primary_key . is_none ( ) , @ " true " ) ;
// The third batch should succeed and only contain one task.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " third_task_succeeds " ) ;
// The primary key should be set to `paw` since this batch succeeded.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " paw " ) ;
// We should be able to batch together the next two tasks that don't specify any primary key
2023-01-24 03:16:16 +08:00
// + the last task that matches the current primary-key. Everything should succeed.
2023-01-24 00:32:13 +08:00
handle . advance_one_successful_batch ( ) ;
2023-01-24 03:16:16 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_other_tasks_succeeds " ) ;
2023-01-24 00:32:13 +08:00
// Is the primary key still what we expect?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " paw " ) ;
// Is the document still the one we expect?
// Every stored document is converted back to JSON and compared against the `documents` snapshot.
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
}
#[ test ]
// Same scenario as the set-and-null primary key test, except that the documents carry a
// `doggoid` field instead of `paw`. The first task (registered without a primary key) is
// expected to succeed, and the index is expected to end up with `doggoid` as primary key.
fn test_document_addition_with_set_and_null_primary_key_inference_works ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
for ( id , primary_key ) in [ None , Some ( " bork " ) , Some ( " doggoid " ) , None , None , Some ( " doggoid " ) ]
. into_iter ( )
. enumerate ( )
{
// Each payload is a single JSON document whose `doggoid` field is the loop index.
let content = format! (
r #" {{
" doggoid " : { id } ,
" doggo " : " jean bob "
} } " #,
) ;
// The loop index is also passed as the UUID of the update file.
let ( uuid , mut file ) =
index_scheduler . create_update_file_with_uuid ( id as u128 ) . unwrap ( ) ;
2024-02-23 01:42:12 +08:00
let documents_count = read_json ( content . as_bytes ( ) , & mut file ) . unwrap ( ) ;
2023-01-24 00:32:13 +08:00
assert_eq! ( documents_count , 1 ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : primary_key . map ( | pk | pk . to_string ( ) ) ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-01-24 00:32:13 +08:00
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
}
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_the_6_tasks " ) ;
// The first batch should contain only one task that succeeds and sets the primary key to `doggoid`.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " first_task_succeed " ) ;
// Checking the primary key: at this point we only assert that one has been set.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) ;
snapshot! ( primary_key . is_none ( ) , @ " false " ) ;
// The second batch should contain only one task that fails because it tries to update the primary key to `bork`.
handle . advance_one_failed_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " second_task_fails " ) ;
// The third batch should succeed and only contain one task.
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " third_task_succeeds " ) ;
// We should be able to batch together the next two tasks that don't specify any primary key
2023-01-24 03:16:16 +08:00
// + the last task that matches the current primary-key. Everything should succeed.
2023-01-24 00:32:13 +08:00
handle . advance_one_successful_batch ( ) ;
2023-01-24 03:16:16 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " all_other_tasks_succeeds " ) ;
2023-01-24 00:32:13 +08:00
// Is the primary key still what we expect?
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let primary_key = index . primary_key ( & rtxn ) . unwrap ( ) . unwrap ( ) ;
snapshot! ( primary_key , @ " doggoid " ) ;
// Is the document still the one we expect?
// Every stored document is converted back to JSON and compared against the `documents` snapshot.
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string_pretty ( & documents ) . unwrap ( ) , name : " documents " ) ;
2022-10-20 23:11:44 +08:00
}
#[ test ]
// Registers a single index-creation task on a scheduler configured with a
// `PanicInsideProcessBatch` failure, drives the run loop until `AfterProcessing`, and
// snapshots the scheduler state to check it is still consistent after the failed batch.
fn panic_in_process_batch_for_index_creation ( ) {
2022-11-28 23:27:41 +08:00
// NOTE(review): the `1` presumably designates the run-loop iteration at which the failure is triggered — confirm against `IndexScheduler::test`.
let ( index_scheduler , mut handle ) =
2022-10-20 23:11:44 +08:00
IndexScheduler ::test ( true , vec! [ ( 1 , FailureLocation ::PanicInsideProcessBatch ) ] ) ;
let kind = index_creation_task ( " catto " , " mouse " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2022-11-28 23:27:41 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " registered_the_first_task " ) ;
2022-10-20 23:11:44 +08:00
2022-11-28 23:27:41 +08:00
// The expected breakpoint sequence contains `ProcessBatchFailed` rather than `ProcessBatchSucceeded`.
handle . advance_till ( [ Start , BatchCreated , ProcessBatchFailed , AfterProcessing ] ) ;
2022-10-20 23:11:44 +08:00
// Still in the first iteration
assert_eq! ( * index_scheduler . run_loop_iteration . read ( ) . unwrap ( ) , 1 ) ;
// No matter what happens in process_batch, the index_scheduler should be internally consistent
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " index_creation_failed " ) ;
2022-09-21 18:01:46 +08:00
}
2023-04-25 02:04:50 +08:00
#[ test ]
2023-04-25 23:26:34 +08:00
// Fills the task database of a scheduler configured with a 1 MiB `task_db_size` until
// `register` starts returning an error, then checks that:
// - new tasks (including a task deletion matching nothing) are rejected with the expected
//   message and the `NoSpaceLeftOnDevice` error code;
// - a task deletion that targets existing tasks is still accepted;
// - once that deletion has been processed, tasks can be registered again.
fn test_task_queue_is_full ( ) {
let ( index_scheduler , mut handle ) =
IndexScheduler ::test_with_custom_config ( vec! [ ] , | config | {
// that's the minimum map size possible
config . task_db_size = 1048576 ;
} ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 02:04:50 +08:00
. unwrap ( ) ;
2023-04-25 23:26:34 +08:00
handle . advance_one_successful_batch ( ) ;
// on average this task takes ~600 bytes
// Keep registering the same index creation until the scheduler refuses it.
// Each accepted task is processed right away and is expected to fail
// (presumably because the `doggo` index already exists at that point).
loop {
2023-09-07 17:16:51 +08:00
let result = index_scheduler . register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
) ;
2023-04-25 23:26:34 +08:00
if result . is_err ( ) {
break ;
}
handle . advance_one_failed_batch ( ) ;
2023-04-25 02:04:50 +08:00
}
index_scheduler . assert_internally_consistent ( ) ;
2023-04-25 23:26:34 +08:00
// at this point the task DB should have reached its limit and we should not be able to register new tasks
2023-04-25 02:04:50 +08:00
let result = index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 02:04:50 +08:00
. unwrap_err ( ) ;
2023-04-25 23:26:34 +08:00
snapshot! ( result , @ " Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations. " ) ;
// we can't test this error in an integration test, so as a best effort we still ensure here that it maps to the expected error code
snapshot! ( format! ( " {:?} " , result . error_code ( ) ) , @ " NoSpaceLeftOnDevice " ) ;
2023-04-25 02:04:50 +08:00
2023-04-25 23:26:34 +08:00
// Even a task deletion that doesn't delete anything shouldn't be accepted
let result = index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskDeletion { query : S ( " test " ) , tasks : RoaringBitmap ::new ( ) } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 23:26:34 +08:00
. unwrap_err ( ) ;
2023-04-25 02:04:50 +08:00
snapshot! ( result , @ " Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations. " ) ;
// we can't test this error in an integration test, so as a best effort we still ensure here that it maps to the expected error code
snapshot! ( format! ( " {:?} " , result . error_code ( ) ) , @ " NoSpaceLeftOnDevice " ) ;
2023-04-25 23:26:34 +08:00
// But a task deletion that deletes something should work
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::TaskDeletion { query : S ( " test " ) , tasks : ( 0 .. 100 ) . collect ( ) } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 02:04:50 +08:00
. unwrap ( ) ;
2023-04-25 23:26:34 +08:00
handle . advance_one_successful_batch ( ) ;
2023-04-25 02:04:50 +08:00
2023-04-25 23:26:34 +08:00
// Now we should be able to enqueue a few tasks again
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 02:04:50 +08:00
. unwrap ( ) ;
2023-04-25 23:26:34 +08:00
handle . advance_one_failed_batch ( ) ;
}
#[ test ]
// With `max_number_of_tasks` set to 2, registers more tasks than the limit and checks,
// through a series of named snapshots of the task list, that the scheduler enqueues and
// processes task deletions on its own while the remaining tasks are being processed.
fn test_auto_deletion_of_tasks ( ) {
let ( index_scheduler , mut handle ) =
IndexScheduler ::test_with_custom_config ( vec! [ ] , | config | {
config . max_number_of_tasks = 2 ;
} ) ;
2023-04-25 02:04:50 +08:00
2023-04-25 23:26:34 +08:00
// First index creation: expected to be processed successfully.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 23:26:34 +08:00
. unwrap ( ) ;
2023-04-25 02:04:50 +08:00
handle . advance_one_successful_batch ( ) ;
2023-04-25 23:26:34 +08:00
// Second creation of the same index: expected to be processed as a failed batch.
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 02:04:50 +08:00
. unwrap ( ) ;
2023-04-25 23:26:34 +08:00
handle . advance_one_failed_batch ( ) ;
// at this point the max number of tasks is reached
// we can still enqueue multiple tasks
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 23:26:34 +08:00
. unwrap ( ) ;
index_scheduler
2023-09-07 17:16:51 +08:00
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2023-09-07 17:16:51 +08:00
)
2023-04-25 23:26:34 +08:00
. unwrap ( ) ;
2023-04-26 19:55:02 +08:00
// Snapshot the whole task list with the date fields redacted; the read transaction is
// dropped explicitly before the scheduler is advanced again.
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " } ) , name : " task_queue_is_full " ) ;
drop ( rtxn ) ;
2023-04-25 23:26:34 +08:00
2023-04-26 19:55:02 +08:00
// now we're above the max number of tasks
2023-04-25 23:26:34 +08:00
// and if we try to advance in the tick function a new task deletion should be enqueued
handle . advance_till ( [ Start , BatchCreated ] ) ;
2023-04-26 19:55:02 +08:00
// From here on the snapshots also redact the `original_filter` and `query` fields.
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " , " .**.original_filter " = > " [filter] " , " .**.query " = > " [query] " } ) , name : " task_deletion_have_been_enqueued " ) ;
drop ( rtxn ) ;
2023-04-25 23:26:34 +08:00
// Finish the batch that was created above.
handle . advance_till ( [ InsideProcessBatch , ProcessBatchSucceeded , AfterProcessing ] ) ;
2023-04-26 19:55:02 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " , " .**.original_filter " = > " [filter] " , " .**.query " = > " [query] " } ) , name : " task_deletion_have_been_processed " ) ;
drop ( rtxn ) ;
2023-04-25 23:26:34 +08:00
handle . advance_one_failed_batch ( ) ;
// a new task deletion has been enqueued
handle . advance_one_successful_batch ( ) ;
2023-04-26 19:55:02 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " , " .**.original_filter " = > " [filter] " , " .**.query " = > " [query] " } ) , name : " after_the_second_task_deletion " ) ;
drop ( rtxn ) ;
2023-04-25 23:26:34 +08:00
// One more failed batch followed by one more successful batch, then a final snapshot.
handle . advance_one_failed_batch ( ) ;
handle . advance_one_successful_batch ( ) ;
2023-04-26 19:55:02 +08:00
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " , " .**.original_filter " = > " [filter] " , " .**.query " = > " [query] " } ) , name : " everything_has_been_processed " ) ;
drop ( rtxn ) ;
2023-04-25 02:04:50 +08:00
}
2023-08-07 17:12:08 +08:00
2024-02-20 19:16:50 +08:00
#[ test ]
// Counterpart of the auto-deletion test with `cleanup_enabled` set to `false`: after going
// above `max_number_of_tasks`, the task list is snapshotted before and after the next
// batch creation to check that no task deletion has been enqueued.
fn test_disable_auto_deletion_of_tasks ( ) {
let ( index_scheduler , mut handle ) =
IndexScheduler ::test_with_custom_config ( vec! [ ] , | config | {
config . cleanup_enabled = false ;
config . max_number_of_tasks = 2 ;
} ) ;
// First index creation: expected to be processed successfully.
index_scheduler
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2024-02-20 19:16:50 +08:00
)
. unwrap ( ) ;
handle . advance_one_successful_batch ( ) ;
// Second creation of the same index: expected to be processed as a failed batch.
index_scheduler
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2024-02-20 19:16:50 +08:00
)
. unwrap ( ) ;
handle . advance_one_failed_batch ( ) ;
// at this point the max number of tasks is reached
// we can still enqueue multiple tasks
index_scheduler
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2024-02-20 19:16:50 +08:00
)
. unwrap ( ) ;
index_scheduler
. register (
KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ,
None ,
2024-02-21 18:21:26 +08:00
false ,
2024-02-20 19:16:50 +08:00
)
. unwrap ( ) ;
// Snapshot the whole task list with the date fields redacted; the read transaction is
// dropped explicitly before the scheduler is advanced again.
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " } ) , name : " task_queue_is_full " ) ;
drop ( rtxn ) ;
// now we're above the max number of tasks
// and if we try to advance in the tick function no new task deletion should be enqueued
handle . advance_till ( [ Start , BatchCreated ] ) ;
let rtxn = index_scheduler . env . read_txn ( ) . unwrap ( ) ;
let tasks = index_scheduler . get_task_ids ( & rtxn , & Query { .. Default ::default ( ) } ) . unwrap ( ) ;
let tasks = index_scheduler . get_existing_tasks ( & rtxn , tasks ) . unwrap ( ) ;
snapshot! ( json_string! ( tasks , { " [].enqueuedAt " = > " [date] " , " [].startedAt " = > " [date] " , " [].finishedAt " = > " [date] " , " .**.original_filter " = > " [filter] " , " .**.query " = > " [query] " } ) , name : " task_deletion_have_not_been_enqueued " ) ;
drop ( rtxn ) ;
}
2023-08-07 17:12:08 +08:00
#[ test ]
// Registers three index-creation tasks (`catto`, `doggo`, `whalo`) and checks the output
// of `get_stats` at four points: right after registration, then each time a new batch has
// been created, so that the `enqueued` / `processing` / `succeeded` counters can be
// observed moving one task at a time.
fn basic_get_stats ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
let kind = index_creation_task ( " catto " , " mouse " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2023-08-07 17:12:08 +08:00
let kind = index_creation_task ( " doggo " , " sheep " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2023-08-07 17:12:08 +08:00
let kind = index_creation_task ( " whalo " , " fish " ) ;
2024-02-21 18:21:26 +08:00
let _task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2023-08-07 17:12:08 +08:00
// Nothing has been processed yet: the three tasks are enqueued.
snapshot! ( json_string! ( index_scheduler . get_stats ( ) . unwrap ( ) ) , @ r ###"
{
" indexes " : {
" catto " : 1 ,
" doggo " : 1 ,
" whalo " : 1
} ,
" statuses " : {
" canceled " : 0 ,
" enqueued " : 3 ,
" failed " : 0 ,
" processing " : 0 ,
" succeeded " : 0
} ,
" types " : {
" documentAdditionOrUpdate " : 0 ,
" documentDeletion " : 0 ,
" dumpCreation " : 0 ,
" indexCreation " : 3 ,
" indexDeletion " : 0 ,
" indexSwap " : 0 ,
" indexUpdate " : 0 ,
" settingsUpdate " : 0 ,
" snapshotCreation " : 0 ,
" taskCancelation " : 0 ,
" taskDeletion " : 0
}
}
" ###);
// Once the first batch has been created, one task is processing and two are still enqueued.
handle . advance_till ( [ Start , BatchCreated ] ) ;
snapshot! ( json_string! ( index_scheduler . get_stats ( ) . unwrap ( ) ) , @ r ###"
{
" indexes " : {
" catto " : 1 ,
" doggo " : 1 ,
" whalo " : 1
} ,
" statuses " : {
" canceled " : 0 ,
" enqueued " : 2 ,
" failed " : 0 ,
" processing " : 1 ,
" succeeded " : 0
} ,
" types " : {
" documentAdditionOrUpdate " : 0 ,
" documentDeletion " : 0 ,
" dumpCreation " : 0 ,
" indexCreation " : 3 ,
" indexDeletion " : 0 ,
" indexSwap " : 0 ,
" indexUpdate " : 0 ,
" settingsUpdate " : 0 ,
" snapshotCreation " : 0 ,
" taskCancelation " : 0 ,
" taskDeletion " : 0
}
}
" ###);
// Finish the first batch and stop right after the second one has been created:
// one task succeeded, one is processing, one is still enqueued.
handle . advance_till ( [
InsideProcessBatch ,
InsideProcessBatch ,
ProcessBatchSucceeded ,
AfterProcessing ,
Start ,
BatchCreated ,
] ) ;
snapshot! ( json_string! ( index_scheduler . get_stats ( ) . unwrap ( ) ) , @ r ###"
{
" indexes " : {
" catto " : 1 ,
" doggo " : 1 ,
" whalo " : 1
} ,
" statuses " : {
" canceled " : 0 ,
" enqueued " : 1 ,
" failed " : 0 ,
" processing " : 1 ,
" succeeded " : 1
} ,
" types " : {
" documentAdditionOrUpdate " : 0 ,
" documentDeletion " : 0 ,
" dumpCreation " : 0 ,
" indexCreation " : 3 ,
" indexDeletion " : 0 ,
" indexSwap " : 0 ,
" indexUpdate " : 0 ,
" settingsUpdate " : 0 ,
" snapshotCreation " : 0 ,
" taskCancelation " : 0 ,
" taskDeletion " : 0
}
}
" ###);
// Finish the second batch and stop right after the third one has been created:
// two tasks succeeded and the last one is processing.
handle . advance_till ( [
InsideProcessBatch ,
InsideProcessBatch ,
ProcessBatchSucceeded ,
AfterProcessing ,
Start ,
BatchCreated ,
] ) ;
snapshot! ( json_string! ( index_scheduler . get_stats ( ) . unwrap ( ) ) , @ r ###"
{
" indexes " : {
" catto " : 1 ,
" doggo " : 1 ,
" whalo " : 1
} ,
" statuses " : {
" canceled " : 0 ,
" enqueued " : 0 ,
" failed " : 0 ,
" processing " : 1 ,
" succeeded " : 2
} ,
" types " : {
" documentAdditionOrUpdate " : 0 ,
" documentDeletion " : 0 ,
" dumpCreation " : 0 ,
" indexCreation " : 3 ,
" indexDeletion " : 0 ,
" indexSwap " : 0 ,
" indexUpdate " : 0 ,
" settingsUpdate " : 0 ,
" snapshotCreation " : 0 ,
" taskCancelation " : 0 ,
" taskDeletion " : 0
}
}
" ###);
}
2023-11-14 17:59:02 +08:00
#[ test ]
// Registers a dump creation, lets the scheduler enter `process_batch` for it, then
// registers a task cancelation targeting task id 0 and checks that the next breakpoint
// reached is `AbortedIndexation` before the cancelation itself is processed successfully.
fn cancel_processing_dump ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
let dump_creation = KindWithContent ::DumpCreation { keys : Vec ::new ( ) , instance_uid : None } ;
// The cancelation targets task id 0, presumably the uid given to the dump creation registered first below.
let dump_cancellation = KindWithContent ::TaskCancelation {
query : " cancel dump " . to_owned ( ) ,
tasks : RoaringBitmap ::from_iter ( [ 0 ] ) ,
} ;
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( dump_creation , None , false ) . unwrap ( ) ;
2023-11-14 17:59:02 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_dump_register " ) ;
// Stop inside `process_batch` so that the cancelation is registered while the dump is being processed.
handle . advance_till ( [ Start , BatchCreated , InsideProcessBatch ] ) ;
2024-02-21 18:21:26 +08:00
let _ = index_scheduler . register ( dump_cancellation , None , false ) . unwrap ( ) ;
2023-11-14 17:59:02 +08:00
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_registered " ) ;
// The very next breakpoint must be the abort of the running batch.
snapshot! ( format! ( " {:?} " , handle . advance ( ) ) , @ " AbortedIndexation " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " cancel_processed " ) ;
}
2023-09-07 17:16:51 +08:00
/// Checks the task uid returned by `register` depending on its second argument: `None` yields
/// uid 0 on a fresh scheduler, `Some(12)` yields uid 12, and a subsequent `Some(5)` is rejected.
#[ test ]
fn basic_set_taskid ( ) {
let ( index_scheduler , _handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
// No uid requested: the first task gets uid 0.
let kind = KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ;
2024-02-21 18:21:26 +08:00
let task = index_scheduler . register ( kind , None , false ) . unwrap ( ) ;
2023-09-07 17:16:51 +08:00
snapshot! ( task . uid , @ " 0 " ) ;
// Explicitly requested uid 12: the task is registered with that uid.
let kind = KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ;
2024-02-21 18:21:26 +08:00
let task = index_scheduler . register ( kind , Some ( 12 ) , false ) . unwrap ( ) ;
2023-09-07 17:16:51 +08:00
snapshot! ( task . uid , @ " 12 " ) ;
// Requesting uid 5 after uid 12 was used must fail: the error states the uid should be >= 13.
let kind = KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ;
2024-02-21 18:21:26 +08:00
let error = index_scheduler . register ( kind , Some ( 5 ) , false ) . unwrap_err ( ) ;
2023-09-07 17:16:51 +08:00
snapshot! ( error , @ " Received bad task id: 5 should be >= to 13. " ) ;
}
2024-02-21 18:21:26 +08:00
/// Registers tasks with the third argument of `register` set to `true` and checks that a task
/// with the expected uid is returned while the scheduler snapshot shows no task at all.
#[ test ]
fn dry_run ( ) {
let ( index_scheduler , _handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
// No uid requested: the returned task has uid 0, but every section of the snapshot is empty.
let kind = KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ;
let task = index_scheduler . register ( kind , None , true ) . unwrap ( ) ;
snapshot! ( task . uid , @ " 0 " ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , @ r ###"
### Autobatching Enabled = true
### Processing Tasks :
[ ]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### All Tasks :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Status :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Kind :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Index Tasks :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Index Mapper :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Canceled By :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Enqueued At :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Started At :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Finished At :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### File Store :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
" ###);
// Explicit uid 12: the returned task carries uid 12 and the snapshot is still empty.
let kind = KindWithContent ::IndexCreation { index_uid : S ( " doggo " ) , primary_key : None } ;
let task = index_scheduler . register ( kind , Some ( 12 ) , true ) . unwrap ( ) ;
snapshot! ( task . uid , @ " 12 " ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , @ r ###"
### Autobatching Enabled = true
### Processing Tasks :
[ ]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### All Tasks :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Status :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Kind :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Index Tasks :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Index Mapper :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Canceled By :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Enqueued At :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Started At :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Finished At :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### File Store :
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
" ###);
}
2024-05-20 16:23:12 +08:00
/// Configures two embedders on the `doggos` index (a REST one and a HuggingFace one), adds a
/// document carrying explicit `_vectors`, then updates that document without vectors, checking
/// the stored embeddings and the per-embedder bitmaps after each step.
#[ test ]
fn import_vectors ( ) {
use meilisearch_types ::settings ::{ Settings , Unchecked } ;
use milli ::update ::Setting ;
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
let mut new_settings : Box < Settings < Unchecked > > = Box ::default ( ) ;
let mut embedders = BTreeMap ::default ( ) ;
// First embedder: REST source pointing at a local URL, with 384 dimensions.
let embedding_settings = milli ::vector ::settings ::EmbeddingSettings {
source : Setting ::Set ( milli ::vector ::settings ::EmbedderSource ::Rest ) ,
api_key : Setting ::Set ( S ( " My super secret " ) ) ,
url : Setting ::Set ( S ( " http://localhost:7777 " ) ) ,
dimensions : Setting ::Set ( 384 ) ,
.. Default ::default ( )
} ;
embedders . insert ( S ( " A_fakerest " ) , Setting ::Set ( embedding_settings ) ) ;
// Second embedder: HuggingFace source with a pinned model revision and a document template.
let embedding_settings = milli ::vector ::settings ::EmbeddingSettings {
source : Setting ::Set ( milli ::vector ::settings ::EmbedderSource ::HuggingFace ) ,
model : Setting ::Set ( S ( " sentence-transformers/all-MiniLM-L6-v2 " ) ) ,
revision : Setting ::Set ( S ( " e4ce9877abf3edfe10b0d82785e83bdcb973e22e " ) ) ,
document_template : Setting ::Set ( S ( " {{doc.doggo}} the {{doc.breed}} best doggo " ) ) ,
.. Default ::default ( )
} ;
embedders . insert ( S ( " B_small_hf " ) , Setting ::Set ( embedding_settings ) ) ;
new_settings . embedders = Setting ::Set ( embedders ) ;
// Register the settings update; `allow_index_creation` is set, so the index need not exist yet.
index_scheduler
. register (
KindWithContent ::SettingsUpdate {
index_uid : S ( " doggos " ) ,
new_settings ,
is_deletion : false ,
allow_index_creation : true ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after_registering_settings_task_vectors " ) ;
// Snapshot the details of task 0 as exposed through `TaskView`, before processing.
{
let rtxn = index_scheduler . read_txn ( ) . unwrap ( ) ;
let task = index_scheduler . get_task ( & rtxn , 0 ) . unwrap ( ) . unwrap ( ) ;
let task = meilisearch_types ::task_view ::TaskView ::from_task ( & task ) ;
insta ::assert_json_snapshot! ( task . details ) ;
}
handle . advance_n_successful_batches ( 1 ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " settings_update_processed_vectors " ) ;
// Snapshot the details of task 0 again, after the batch was processed.
{
let rtxn = index_scheduler . read_txn ( ) . unwrap ( ) ;
let task = index_scheduler . get_task ( & rtxn , 0 ) . unwrap ( ) . unwrap ( ) ;
let task = meilisearch_types ::task_view ::TaskView ::from_task ( & task ) ;
insta ::assert_json_snapshot! ( task . details ) ;
}
// Read back both embedder configs, check their names and (still empty) bitmaps, and use the
// HuggingFace embedder to compute three reference embeddings compared against later on.
let ( fakerest_name , simple_hf_name , beagle_embed , lab_embed , patou_embed ) = {
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let configs = index . embedding_configs ( & rtxn ) . unwrap ( ) ;
// for consistency with the below
#[ allow(clippy::get_first) ]
2024-05-22 21:27:09 +08:00
let ( name , fakerest_config , user_provided ) = configs . get ( 0 ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " A_fakerest " ) ;
insta ::assert_debug_snapshot! ( user_provided , @ " RoaringBitmap<[]> " ) ;
2024-05-20 16:23:12 +08:00
insta ::assert_json_snapshot! ( fakerest_config . embedder_options ) ;
let fakerest_name = name . clone ( ) ;
2024-05-22 21:27:09 +08:00
let ( name , simple_hf_config , user_provided ) = configs . get ( 1 ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " B_small_hf " ) ;
insta ::assert_debug_snapshot! ( user_provided , @ " RoaringBitmap<[]> " ) ;
2024-05-20 16:23:12 +08:00
insta ::assert_json_snapshot! ( simple_hf_config . embedder_options ) ;
let simple_hf_name = name . clone ( ) ;
let configs = index_scheduler . embedders ( configs ) . unwrap ( ) ;
let ( hf_embedder , _ ) = configs . get ( & simple_hf_name ) . unwrap ( ) ;
let beagle_embed = hf_embedder . embed_one ( S ( " Intel the beagle best doggo " ) ) . unwrap ( ) ;
let lab_embed = hf_embedder . embed_one ( S ( " Max the lab best doggo " ) ) . unwrap ( ) ;
let patou_embed = hf_embedder . embed_one ( S ( " kefir the patou best doggo " ) ) . unwrap ( ) ;
( fakerest_name , simple_hf_name , beagle_embed , lab_embed , patou_embed )
} ;
// add one doc, specifying vectors
let doc = serde_json ::json! (
{
" id " : 0 ,
" doggo " : " Intel " ,
" breed " : " beagle " ,
" _vectors " : {
& fakerest_name : {
// this will never trigger regeneration, which is good because we can't actually generate with
// this embedder
" userProvided " : true ,
" embeddings " : beagle_embed ,
} ,
& simple_hf_name : {
// this will be regenerated on updates
" userProvided " : false ,
" embeddings " : lab_embed ,
} ,
" noise " : [ 0.1 , 0.2 , 0.3 ]
}
}
) ;
// Write the document into an update file and register the document addition.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 u128 ) . unwrap ( ) ;
let documents_count = read_json ( doc . to_string ( ) . as_bytes ( ) , & mut file ) . unwrap ( ) ;
assert_eq! ( documents_count , 1 ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : Some ( S ( " id " ) ) ,
method : UpdateDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " after adding Intel " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " adding Intel succeeds " ) ;
// check embeddings
{
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
2024-05-22 21:27:09 +08:00
// Ensure the document has been inserted into the relevant bitmap
let configs = index . embedding_configs ( & rtxn ) . unwrap ( ) ;
// for consistency with the below
#[ allow(clippy::get_first) ]
let ( name , _config , user_defined ) = configs . get ( 0 ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " A_fakerest " ) ;
insta ::assert_debug_snapshot! ( user_defined , @ " RoaringBitmap<[0]> " ) ;
let ( name , _config , user_defined ) = configs . get ( 1 ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " B_small_hf " ) ;
insta ::assert_debug_snapshot! ( user_defined , @ " RoaringBitmap<[]> " ) ;
2024-05-20 16:23:12 +08:00
// Both stored embeddings must equal the vectors supplied in the document.
let embeddings = index . embeddings ( & rtxn , 0 ) . unwrap ( ) ;
assert_json_snapshot! ( embeddings [ & simple_hf_name ] [ 0 ] = = lab_embed , @ " true " ) ;
assert_json_snapshot! ( embeddings [ & fakerest_name ] [ 0 ] = = beagle_embed , @ " true " ) ;
// Snapshot the stored document restricted to the `doggo`, `breed` and `_vectors` fields.
let doc = index . documents ( & rtxn , std ::iter ::once ( 0 ) ) . unwrap ( ) [ 0 ] . 1 ;
let fields_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let doc = obkv_to_json (
& [
fields_ids_map . id ( " doggo " ) . unwrap ( ) ,
fields_ids_map . id ( " breed " ) . unwrap ( ) ,
fields_ids_map . id ( " _vectors " ) . unwrap ( ) ,
] ,
& fields_ids_map ,
doc ,
)
. unwrap ( ) ;
// The A_fakerest embeddings are redacted to "[vector]" in the snapshot.
assert_json_snapshot! ( doc , { " ._vectors.A_fakerest.embeddings " = > " [vector] " } ) ;
}
// update the doc without specifying any vectors
let doc = serde_json ::json! (
{
" id " : 0 ,
" doggo " : " kefir " ,
" breed " : " patou " ,
}
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 1 u128 ) . unwrap ( ) ;
let documents_count = read_json ( doc . to_string ( ) . as_bytes ( ) , & mut file ) . unwrap ( ) ;
assert_eq! ( documents_count , 1 ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : None ,
method : UpdateDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " Intel to kefir " ) ;
handle . advance_one_successful_batch ( ) ;
snapshot! ( snapshot_index_scheduler ( & index_scheduler ) , name : " Intel to kefir succeeds " ) ;
{
// check embeddings
{
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
2024-05-22 21:27:09 +08:00
// Ensure the document has been inserted into the relevant bitmap
let configs = index . embedding_configs ( & rtxn ) . unwrap ( ) ;
// for consistency with the below
#[ allow(clippy::get_first) ]
let ( name , _config , user_defined ) = configs . get ( 0 ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " A_fakerest " ) ;
insta ::assert_debug_snapshot! ( user_defined , @ " RoaringBitmap<[0]> " ) ;
let ( name , _config , user_defined ) = configs . get ( 1 ) . unwrap ( ) ;
insta ::assert_snapshot! ( name , @ " B_small_hf " ) ;
insta ::assert_debug_snapshot! ( user_defined , @ " RoaringBitmap<[]> " ) ;
2024-05-20 16:23:12 +08:00
let embeddings = index . embeddings ( & rtxn , 0 ) . unwrap ( ) ;
// automatically changed to patou
assert_json_snapshot! ( embeddings [ & simple_hf_name ] [ 0 ] = = patou_embed , @ " true " ) ;
// remained beagle because set to userProvided
assert_json_snapshot! ( embeddings [ & fakerest_name ] [ 0 ] = = beagle_embed , @ " true " ) ;
let doc = index . documents ( & rtxn , std ::iter ::once ( 0 ) ) . unwrap ( ) [ 0 ] . 1 ;
let fields_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let doc = obkv_to_json (
& [
fields_ids_map . id ( " doggo " ) . unwrap ( ) ,
fields_ids_map . id ( " breed " ) . unwrap ( ) ,
fields_ids_map . id ( " _vectors " ) . unwrap ( ) ,
] ,
& fields_ids_map ,
doc ,
)
. unwrap ( ) ;
// The A_fakerest embeddings are redacted to "[vector]" in the snapshot.
assert_json_snapshot! ( doc , { " ._vectors.A_fakerest.embeddings " = > " [vector] " } ) ;
}
}
}
2024-05-22 21:27:09 +08:00
/// Pushes documents that already carry `_vectors` for an embedder that is not configured yet,
/// then configures that embedder through a settings update and checks what is kept in the
/// document DB, which documents are flagged in the embedder's bitmap, and which embeddings are
/// stored. Finally updates two documents and inspects the embedding of document 3 again.
#[ test ]
fn import_vectors_first_and_embedder_later ( ) {
let ( index_scheduler , mut handle ) = IndexScheduler ::test ( true , vec! [ ] ) ;
// Five documents covering the different ways of (not) providing vectors for
// `my_doggo_embedder`: none, a bare array, an object with `userProvided: true`, an object with
// `userProvided: false` plus embeddings, and an object with `userProvided: false` only.
let content = serde_json ::json! (
[
{
" id " : 0 ,
" doggo " : " kefir " ,
} ,
{
" id " : 1 ,
" doggo " : " intel " ,
" _vectors " : {
" my_doggo_embedder " : vec ! [ 1 ; 384 ] ,
" unknown embedder " : vec ! [ 1 , 2 , 3 ] ,
}
} ,
{
" id " : 2 ,
" doggo " : " max " ,
" _vectors " : {
" my_doggo_embedder " : {
" userProvided " : true ,
" embeddings " : vec ! [ 2 ; 384 ] ,
} ,
" unknown embedder " : vec ! [ 4 , 5 ] ,
} ,
} ,
{
" id " : 3 ,
" doggo " : " marcel " ,
" _vectors " : {
" my_doggo_embedder " : {
" userProvided " : false ,
" embeddings " : vec ! [ 3 ; 384 ] ,
} ,
} ,
} ,
{
" id " : 4 ,
" doggo " : " sora " ,
" _vectors " : {
" my_doggo_embedder " : {
" userProvided " : false ,
} ,
} ,
} ,
]
) ;
// Write the documents into an update file and register the initial document addition.
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 0 as u128 ) . unwrap ( ) ;
let documents_count =
read_json ( serde_json ::to_string_pretty ( & content ) . unwrap ( ) . as_bytes ( ) , & mut file )
. unwrap ( ) ;
snapshot! ( documents_count , @ " 5 " ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : None ,
method : ReplaceDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
handle . advance_one_successful_batch ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
// Dump every stored document (all fields) as JSON and snapshot it.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
snapshot! ( serde_json ::to_string ( & documents ) . unwrap ( ) , name : " documents after initial push " ) ;
// Now configure `my_doggo_embedder` as a HuggingFace embedder on the existing index.
let mut setting = meilisearch_types ::settings ::Settings ::< Unchecked > ::default ( ) ;
setting . embedders = Setting ::Set ( maplit ::btreemap! {
S ( " my_doggo_embedder " ) = > Setting ::Set ( EmbeddingSettings {
source : Setting ::Set ( milli ::vector ::settings ::EmbedderSource ::HuggingFace ) ,
model : Setting ::Set ( S ( " sentence-transformers/all-MiniLM-L6-v2 " ) ) ,
revision : Setting ::Set ( S ( " e4ce9877abf3edfe10b0d82785e83bdcb973e22e " ) ) ,
document_template : Setting ::Set ( S ( " {{doc.doggo}} " ) ) ,
.. EmbeddingSettings ::default ( )
} )
} ) ;
index_scheduler
. register (
KindWithContent ::SettingsUpdate {
index_uid : S ( " doggos " ) ,
new_settings : Box ::new ( setting ) ,
is_deletion : false ,
allow_index_creation : false ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
handle . advance_one_successful_batch ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
// Re-open the index and a fresh read transaction to observe the processed settings update.
let index = index_scheduler . index ( " doggos " ) . unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let field_ids_map = index . fields_ids_map ( & rtxn ) . unwrap ( ) ;
let field_ids = field_ids_map . ids ( ) . collect ::< Vec < _ > > ( ) ;
let documents = index
. all_documents ( & rtxn )
. unwrap ( )
. map ( | ret | obkv_to_json ( & field_ids , & field_ids_map , ret . unwrap ( ) . 1 ) . unwrap ( ) )
. collect ::< Vec < _ > > ( ) ;
// All the vectors linked to the newly specified embedder have been removed.
// Only the unknown embedders stay in the document DB.
snapshot! ( serde_json ::to_string ( & documents ) . unwrap ( ) , @ r ### "[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"### ) ;
let conf = index . embedding_configs ( & rtxn ) . unwrap ( ) ;
// even though we specified the vector for the ID 3, it shouldn't be marked
// as user provided since we explicitly marked it as NOT user provided.
snapshot! ( format! ( " {conf:#?} " ) , @ r ###"
[
(
" my_doggo_embedder " ,
EmbeddingConfig {
embedder_options : HuggingFace (
EmbedderOptions {
model : " sentence-transformers/all-MiniLM-L6-v2 " ,
revision : Some (
" e4ce9877abf3edfe10b0d82785e83bdcb973e22e " ,
) ,
distribution : None ,
} ,
) ,
prompt : PromptData {
template : " {{doc.doggo}} " ,
} ,
} ,
RoaringBitmap < [ 1 , 2 ] > ,
) ,
]
" ###);
// The document with the id 0 had no vector at all: it must now have a non-empty embedding.
let docid = index . external_documents_ids . get ( & rtxn , " 0 " ) . unwrap ( ) . unwrap ( ) ;
let embeddings = index . embeddings ( & rtxn , docid ) . unwrap ( ) ;
let embedding = & embeddings [ " my_doggo_embedder " ] ;
assert! ( ! embedding . is_empty ( ) , " {embedding:?} " ) ;
// the document with the id 3 should keep its original embedding
let docid = index . external_documents_ids . get ( & rtxn , " 3 " ) . unwrap ( ) . unwrap ( ) ;
let mut embeddings = Vec ::new ( ) ;
// Walk the arroy readers at indexes `0 | i` for i in 0..=255, collecting the vector stored
// for `docid`; stop at the first reader with missing metadata or without a vector for it.
// NOTE(review): the `0 |` presumably stands for the embedder index in the upper bits — confirm.
' vectors : for i in 0 ..= u8 ::MAX {
let reader = arroy ::Reader ::open ( & rtxn , 0 | ( i as u16 ) , index . vector_arroy )
. map ( Some )
. or_else ( | e | match e {
arroy ::Error ::MissingMetadata = > Ok ( None ) ,
e = > Err ( e ) ,
} )
. transpose ( ) ;
let Some ( reader ) = reader else {
break 'vectors ;
} ;
let embedding = reader . unwrap ( ) . item_vector ( & rtxn , docid ) . unwrap ( ) ;
if let Some ( embedding ) = embedding {
embeddings . push ( embedding )
} else {
break 'vectors ;
}
}
// Exactly one embedding is stored and every component still equals the provided 3.0.
snapshot! ( embeddings . len ( ) , @ " 1 " ) ;
assert! ( embeddings [ 0 ] . iter ( ) . all ( | i | * i = = 3.0 ) , " {:?} " , embeddings [ 0 ] ) ;
// If we update marcel it should regenerate its embedding automatically
let content = serde_json ::json! (
[
{
" id " : 3 ,
" doggo " : " marvel " ,
} ,
{
" id " : 4 ,
" doggo " : " sorry " ,
} ,
]
) ;
let ( uuid , mut file ) = index_scheduler . create_update_file_with_uuid ( 1 as u128 ) . unwrap ( ) ;
let documents_count =
read_json ( serde_json ::to_string_pretty ( & content ) . unwrap ( ) . as_bytes ( ) , & mut file )
. unwrap ( ) ;
snapshot! ( documents_count , @ " 2 " ) ;
file . persist ( ) . unwrap ( ) ;
index_scheduler
. register (
KindWithContent ::DocumentAdditionOrUpdate {
index_uid : S ( " doggos " ) ,
primary_key : None ,
method : UpdateDocuments ,
content_file : uuid ,
documents_count ,
allow_index_creation : true ,
} ,
None ,
false ,
)
. unwrap ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
handle . advance_one_successful_batch ( ) ;
index_scheduler . assert_internally_consistent ( ) ;
// the document with the id 3 should have its original embedding updated
// NOTE(review): `index` and `rtxn` used below were opened before the batch above was
// processed; if the read transaction is a snapshot taken at that time it cannot observe the
// update, which would explain the TODO below — confirm, and reopen the transaction if so.
let docid = index . external_documents_ids . get ( & rtxn , " 3 " ) . unwrap ( ) . unwrap ( ) ;
let embeddings = index . embeddings ( & rtxn , docid ) . unwrap ( ) ;
let embedding = & embeddings [ " my_doggo_embedder " ] ;
assert! ( ! embedding . is_empty ( ) ) ;
// TODO: it shouldn't be equal to 3.0
assert! ( embedding [ 0 ] . iter ( ) . all ( | i | * i = = 3.0 ) , " {:?} " , embedding [ 0 ] ) ;
// the document with the id 4 should generate an embedding
// let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
// let embeddings = index.embeddings(&rtxn, docid).unwrap();
// dbg!(&embeddings);
// let embedding = &embeddings["my_doggo_embedder"];
// assert!(!embedding.is_empty());
// assert!(embedding[0]);
}
2022-09-15 18:23:41 +08:00
}