From e96b852107221f58a91ec1f023d606eeefce99af Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 10 Aug 2022 16:25:24 +0200 Subject: [PATCH 1/2] bump heed --- milli/Cargo.toml | 3 ++- milli/src/error.rs | 3 +++ milli/src/update/index_documents/mod.rs | 29 ++++++++++++++----------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2bb6a50a1..fbe756ac6 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,8 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +# heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", branch = "compute_size", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.3" diff --git a/milli/src/error.rs b/milli/src/error.rs index c817f64fa..d3f0a179f 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -116,6 +116,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco } )] InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, + #[error("{}", HeedError::BadOpenOptions)] + InvalidLmdbOpenOptions, #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] SortRankingRuleMissing, #[error("The database file is in an invalid state.")] @@ -244,6 +246,7 @@ impl From for Error { HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), HeedError::DatabaseClosing => InternalError(DatabaseClosing), + HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions), } } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d1f030fdd..f5e04435d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -278,27 +278,30 @@ where let stop_words = self.index.stop_words(self.wtxn)?; let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; + let pool_params = GrenadParameters { + chunk_compression_type: self.indexer_config.chunk_compression_type, + chunk_compression_level: self.indexer_config.chunk_compression_level, + max_memory: self.indexer_config.max_memory, + max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. + }; + let documents_chunk_size = + self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB + let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; + // Run extraction pipeline in parallel. pool.install(|| { - let params = GrenadParameters { - chunk_compression_type: self.indexer_config.chunk_compression_type, - chunk_compression_level: self.indexer_config.chunk_compression_level, - max_memory: self.indexer_config.max_memory, - max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. - }; - // split obkv file into several chunks let original_chunk_iter = grenad_obkv_into_chunks( original_documents, - params.clone(), - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB + pool_params.clone(), + documents_chunk_size, ); // split obkv file into several chunks let flattened_chunk_iter = grenad_obkv_into_chunks( flattened_documents, - params.clone(), - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB + pool_params.clone(), + documents_chunk_size, ); let result = original_chunk_iter @@ -308,14 +311,14 @@ where extract::data_from_obkv_documents( original_chunk, flattened_chunk, - params, + pool_params, lmdb_writer_sx.clone(), searchable_fields, faceted_fields, primary_key_id, geo_fields_ids, stop_words, - self.indexer_config.max_positions_per_attributes, + max_positions_per_attributes, exact_attributes, ) }); From 4aae07d5f557fca6d4441e8214194943899290e9 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 11 Aug 2022 11:15:46 +0200 Subject: [PATCH 2/2] expose the size methods --- benchmarks/benches/indexing.rs | 2 +- benchmarks/benches/utils.rs | 2 +- cli/Cargo.toml | 1 - cli/src/main.rs | 2 +- helpers/Cargo.toml | 1 - helpers/src/main.rs | 2 +- http-ui/Cargo.toml | 1 - http-ui/src/main.rs | 2 +- http-ui/src/update_store.rs | 1 + infos/Cargo.toml | 1 - infos/src/main.rs | 2 +- milli/Cargo.toml | 3 +-- milli/src/index.rs | 10 ++++++++++ milli/src/update/delete_documents.rs | 26 +++++++++++++++++++++----- 14 files changed, 39 insertions(+), 17 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index d532c85d9..a409e1343 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -5,7 +5,7 @@ use std::fs::{create_dir_all, remove_dir_all}; use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; -use heed::{EnvOpenOptions, RwTxn}; +use milli::heed::{EnvOpenOptions, RwTxn}; use milli::update::{ DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, }; diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index fba05edbe..8c556b383 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -6,8 +6,8 @@ use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; -use heed::EnvOpenOptions; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::heed::EnvOpenOptions; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 504df712e..e45fb3344 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -11,7 +11,6 @@ byte-unit = { version = "4.0.14", features = ["serde"] } color-eyre = "0.6.1" csv = "1.1.6" eyre = "0.6.7" -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } indicatif = "0.16.2" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } diff --git a/cli/src/main.rs b/cli/src/main.rs index 8485560f5..e3bbced3e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::{Index, Object}; +use milli::{heed, Index, Object}; use structopt::StructOpt; #[global_allocator] diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index bd09574f3..9a8496e28 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -9,7 +9,6 @@ publish = false [dependencies] anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } stderrlog = "0.5.1" diff --git a/helpers/src/main.rs b/helpers/src/main.rs index 0081965ad..d1050e937 100644 --- a/helpers/src/main.rs +++ b/helpers/src/main.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use byte_unit::Byte; -use heed::{CompactionOption, Env, EnvOpenOptions}; +use milli::heed::{CompactionOption, Env, EnvOpenOptions}; use structopt::StructOpt; use Command::*; diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 993818f93..6d902f5b3 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,6 @@ publish = false anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } crossbeam-channel = "0.5.2" -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } memmap2 = "0.5.3" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index de5d3c5ab..afde8cc1a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -17,8 +17,8 @@ use byte_unit::Byte; use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; -use heed::EnvOpenOptions; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::heed::EnvOpenOptions; use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ diff --git a/http-ui/src/update_store.rs b/http-ui/src/update_store.rs index b77057fda..bbbff25c8 100644 --- a/http-ui/src/update_store.rs +++ b/http-ui/src/update_store.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use crossbeam_channel::Sender; use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; use heed::{Database, Env, EnvOpenOptions}; +use milli::heed; use serde::{Deserialize, Serialize}; pub type BEU64 = heed::zerocopy::U64; diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 8c92ae649..7c17782c3 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -9,7 +9,6 @@ publish = false anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } csv = "1.1.6" -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } roaring = "0.9.0" diff --git a/infos/src/main.rs b/infos/src/main.rs index 1fbd50889..f5fdcf94a 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -7,7 +7,7 @@ use byte_unit::Byte; use heed::EnvOpenOptions; use milli::facet::FacetType; use milli::index::db_name::*; -use milli::{FieldId, Index}; +use milli::{heed, FieldId, Index}; use structopt::StructOpt; use Command::*; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index fbe756ac6..1441461f3 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,8 +18,7 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } -# heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } -heed = { git = "https://github.com/meilisearch/heed", branch = "compute_size", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.3" diff --git a/milli/src/index.rs b/milli/src/index.rs index 36e15c181..0dccabf03 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -223,6 +223,16 @@ impl Index { self.env.path() } + /// Returns the size used by the index without the cached pages. + pub fn used_size(&self) -> Result { + Ok(self.env.non_free_pages_size()?) + } + + /// Returns the real size used by the index. + pub fn on_disk_size(&self) -> Result { + Ok(self.env.real_disk_size()?) + } + pub fn copy_to_path>(&self, path: P, option: CompactionOption) -> Result { self.env.copy_to_path(path, option).map_err(Into::into) } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index c981ee061..eae473f51 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -20,10 +20,6 @@ use crate::{ RoaringBitmapCodec, SmallString32, BEU32, }; -/// The threshold we use to determine after which number of documents we want to clear the -/// soft-deleted database and delete documents for real. -const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000; - pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -129,7 +125,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // if we have less documents to delete than the threshold we simply save them in // the `soft_deleted_documents_ids` bitmap and early exit. - if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD { + let size_used = self.index.used_size()?; + let map_size = self.index.env.map_size()? as u64; + let nb_documents = self.index.number_of_documents(&self.wtxn)?; + let nb_soft_deleted = soft_deleted_docids.len(); + + let percentage_available = 100 - (size_used * 100 / map_size); + let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); + let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted; + let percentage_used_by_soft_deleted_documents = + estimated_size_used_by_soft_deleted * 100 / map_size; + + // if we have more than 10% of disk space available and the soft deleted + // documents uses less than 10% of the total space available, + // we skip the deletion. Eg. + // - With 100Go of disk and 20Go used including 5Go of soft-deleted documents + // We don’t delete anything. + // - With 100Go of disk and 95Go used including 1mo of soft-deleted documents + // We run the deletion. + // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents + // We run the deletion. + if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(),