From 8cb7001755b4f796df71a7a3f73688c0003911e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Sep 2024 10:57:52 +0200 Subject: [PATCH] Expose an experimental parameter to control the generation of prefix dbs --- index-scheduler/src/batch.rs | 9 ++++- index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 34 ++++------------- .../src/analytics/segment_analytics.rs | 3 ++ meilisearch/src/lib.rs | 1 + meilisearch/src/option.rs | 11 ++++++ milli/src/index.rs | 5 +++ milli/src/update/index_documents/mod.rs | 37 +++++++++++++++---- 8 files changed, 67 insertions(+), 34 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index cd5525eea..3e73c21ce 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1288,7 +1288,11 @@ impl IndexScheduler { } } - let config = IndexDocumentsConfig { update_method: method, ..Default::default() }; + let config = IndexDocumentsConfig { + update_method: method, + compute_prefix_databases: self.compute_prefix_databases, + ..Default::default() + }; let embedder_configs = index.embedding_configs(index_wtxn)?; // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense) @@ -1398,6 +1402,7 @@ impl IndexScheduler { let deleted_documents = delete_document_by_filter( index_wtxn, filter, + self.compute_prefix_databases, self.index_mapper.indexer_config(), self.must_stop_processing.clone(), index, @@ -1638,6 +1643,7 @@ impl IndexScheduler { fn delete_document_by_filter<'a>( wtxn: &mut RwTxn<'a>, filter: &serde_json::Value, + compute_prefix_databases: bool, indexer_config: &IndexerConfig, must_stop_processing: MustStopProcessing, index: &'a Index, @@ -1653,6 +1659,7 @@ fn delete_document_by_filter<'a>( let config = IndexDocumentsConfig { update_method: IndexDocumentsMethod::ReplaceDocuments, + compute_prefix_databases, ..Default::default() }; diff --git 
a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index d8625a2c7..7d6f4f5be 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -32,6 +32,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { features: _, max_number_of_tasks: _, max_number_of_batched_tasks: _, + compute_prefix_databases: _, wake_up: _, dumps_path: _, snapshots_path: _, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 213ec3230..e15921e15 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -276,6 +276,8 @@ pub struct IndexSchedulerOptions { pub max_number_of_batched_tasks: usize, /// The experimental features enabled for this instance. pub instance_features: InstanceTogglableFeatures, + /// An experimental option to control the generation of prefix databases. + pub compute_prefix_databases: bool, } /// Structure which holds meilisearch's indexes and schedules the tasks @@ -283,19 +285,13 @@ pub struct IndexSchedulerOptions { pub struct IndexScheduler { /// The LMDB environment which the DBs are associated with. pub(crate) env: Env, - /// A boolean that can be set to true to stop the currently processing tasks. pub(crate) must_stop_processing: MustStopProcessing, - /// The list of tasks currently processing pub(crate) processing_tasks: Arc>, - /// The list of files referenced by the tasks - pub(crate) file_store: FileStore, - - // The main database, it contains all the tasks accessible by their Id. + pub(crate) file_store: FileStore, // The main database, it contains all the tasks accessible by their Id. pub(crate) all_tasks: Database>, - /// All the tasks ids grouped by their status. // TODO we should not be able to serialize a `Status::Processing` in this database. 
pub(crate) status: Database, RoaringBitmapCodec>, @@ -303,58 +299,43 @@ pub struct IndexScheduler { pub(crate) kind: Database, RoaringBitmapCodec>, /// Store the tasks associated to an index. pub(crate) index_tasks: Database, - /// Store the tasks that were canceled by a task uid pub(crate) canceled_by: Database, - /// Store the task ids of tasks which were enqueued at a specific date pub(crate) enqueued_at: Database, - /// Store the task ids of finished tasks which started being processed at a specific date pub(crate) started_at: Database, - /// Store the task ids of tasks which finished at a specific date pub(crate) finished_at: Database, - /// In charge of creating, opening, storing and returning indexes. pub(crate) index_mapper: IndexMapper, - /// In charge of fetching and setting the status of experimental features. features: features::FeatureData, - /// Get a signal when a batch needs to be processed. pub(crate) wake_up: Arc, - /// Whether auto-batching is enabled or not. pub(crate) autobatching_enabled: bool, - /// Whether we should automatically cleanup the task queue or not. pub(crate) cleanup_enabled: bool, - /// The max number of tasks allowed before the scheduler starts to delete /// the finished tasks automatically. pub(crate) max_number_of_tasks: usize, - /// The maximum number of tasks that will be batched together. pub(crate) max_number_of_batched_tasks: usize, - + /// Control whether we must generate the prefix databases or not. + pub(crate) compute_prefix_databases: bool, /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, /// The Authorization header to send to the webhook URL. pub(crate) webhook_authorization_header: Option, - /// The path used to create the dumps. pub(crate) dumps_path: PathBuf, - /// The path used to create the snapshots. pub(crate) snapshots_path: PathBuf, - /// The path to the folder containing the auth LMDB env.
pub(crate) auth_path: PathBuf, - /// The path to the version file of Meilisearch. pub(crate) version_file_path: PathBuf, - embedders: Arc>>>, // ================= test @@ -364,13 +345,11 @@ pub struct IndexScheduler { /// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation. #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, - /// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler. /// /// The first field is the iteration index and the second field identifies a location in the code. #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, - /// A counter that is incremented before every call to [`tick`](IndexScheduler::tick) #[cfg(test)] run_loop_iteration: Arc>, @@ -397,6 +376,7 @@ impl IndexScheduler { cleanup_enabled: self.cleanup_enabled, max_number_of_tasks: self.max_number_of_tasks, max_number_of_batched_tasks: self.max_number_of_batched_tasks, + compute_prefix_databases: self.compute_prefix_databases, snapshots_path: self.snapshots_path.clone(), dumps_path: self.dumps_path.clone(), auth_path: self.auth_path.clone(), @@ -499,6 +479,7 @@ impl IndexScheduler { cleanup_enabled: options.cleanup_enabled, max_number_of_tasks: options.max_number_of_tasks, max_number_of_batched_tasks: options.max_number_of_batched_tasks, + compute_prefix_databases: options.compute_prefix_databases, dumps_path: options.dumps_path, snapshots_path: options.snapshots_path, auth_path: options.auth_path, @@ -1819,6 +1800,7 @@ mod tests { max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: usize::MAX, instance_features: Default::default(), + compute_prefix_databases: true, }; configuration(&mut options); diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 94e4684d5..357c792dd 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -256,6 +256,7 @@ 
struct Infos { experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, + experimental_disable_prefix_db: bool, gpu_enabled: bool, db_path: bool, import_dump: bool, @@ -298,6 +299,7 @@ impl From for Infos { experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, + experimental_disable_prefix_db, http_addr, master_key: _, env, @@ -347,6 +349,7 @@ impl From for Infos { experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, + experimental_disable_prefix_db, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index bb7562c85..d8d866c78 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -311,6 +311,7 @@ fn open_or_create_database_unchecked( index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize, index_count: DEFAULT_INDEX_COUNT, instance_features, + compute_prefix_databases: !opt.experimental_disable_prefix_db, })?) 
}; diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index fed824079..c831511b1 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -60,6 +60,7 @@ const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE"; const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS"; +const MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB: &str = "MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_DB_PATH: &str = "./data.ms"; @@ -389,6 +390,11 @@ pub struct Opt { #[serde(default = "default_limit_batched_tasks")] pub experimental_max_number_of_batched_tasks: usize, + /// Experimentally disable the prefix database, see: + #[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB)] + #[serde(default)] + pub experimental_disable_prefix_db: bool, + #[serde(flatten)] #[clap(flatten)] pub indexer_options: IndexerOpts, @@ -489,6 +495,7 @@ impl Opt { experimental_enable_logs_route, experimental_replication_parameters, experimental_reduce_indexing_memory_usage, + experimental_disable_prefix_db, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr); @@ -518,6 +525,10 @@ impl Opt { MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, experimental_max_number_of_batched_tasks.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB, + experimental_disable_prefix_db.to_string(), + ); if let Some(ssl_cert_path) = ssl_cert_path { export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path); } diff --git a/milli/src/index.rs b/milli/src/index.rs index a9b118827..8662d8a62 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1230,6 +1230,11 @@ impl Index { ) } + /// Deletes the FST which is the words prefixes dictionary of the engine.
+ pub fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY) + } + /// Returns the FST which is the words prefixes dictionary of the engine. pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { match self.main.remap_types::().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c2c49be17..beaeac735 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -85,7 +85,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { embedders: EmbeddingConfigs, } -#[derive(Default, Debug, Clone)] +#[derive(Debug, Clone)] pub struct IndexDocumentsConfig { pub words_prefix_threshold: Option, pub max_prefix_length: Option, @@ -93,6 +93,21 @@ pub struct IndexDocumentsConfig { pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, pub autogenerate_docids: bool, + pub compute_prefix_databases: bool, +} + +impl Default for IndexDocumentsConfig { + fn default() -> Self { + Self { + words_prefix_threshold: Default::default(), + max_prefix_length: Default::default(), + words_positions_level_group_size: Default::default(), + words_positions_min_level_size: Default::default(), + update_method: Default::default(), + autogenerate_docids: Default::default(), + compute_prefix_databases: true, + } + } } impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA> @@ -558,12 +573,20 @@ where .map_err(InternalError::from)??; } - self.execute_prefix_databases( - word_docids.map(MergerBuilder::build), - exact_word_docids.map(MergerBuilder::build), - word_position_docids.map(MergerBuilder::build), - word_fid_docids.map(MergerBuilder::build), - )?; + if self.config.compute_prefix_databases { + self.execute_prefix_databases( + word_docids.map(MergerBuilder::build), + exact_word_docids.map(MergerBuilder::build), + 
word_position_docids.map(MergerBuilder::build), + word_fid_docids.map(MergerBuilder::build), + )?; + } else { + self.index.delete_words_prefixes_fst(self.wtxn)?; + self.index.word_prefix_docids.clear(self.wtxn)?; + self.index.exact_word_prefix_docids.clear(self.wtxn)?; + self.index.word_prefix_position_docids.clear(self.wtxn)?; + self.index.word_prefix_fid_docids.clear(self.wtxn)?; + } Ok(number_of_documents) }