Merge #607

607: Better threshold r=Kerollmops a=irevoire # Pull Request ## What does this PR do? Fixes #570 This PR tries to improve the threshold used to trigger the real deletion of documents. The deletion is now triggered in two cases: - 10% of the total available space is used by soft deleted documents - 90% of the total available space is used. In this context, « total available space » means the `map_size` of LMDB. And the size used by the soft deleted documents is actually an estimate. We can't determine precisely the size used by one document, so what we do is: take the total space used, divide it by the number of documents + soft deleted documents to estimate the size of one average document. Then multiply the size of one avg document by the number of soft deleted documents. -------- <img width="808" alt="image" src="https://user-images.githubusercontent.com/7032172/185083075-92cf379e-8ae1-4bfc-9ca6-93b54e6ab4e9.png"> Here we can see we have a ~10GB drift in the end between the space used by the soft deleted documents and the real space used by the documents. Personally I don’t think that's a big issue because once the red line reaches 90GB everything will be freed, but now you know. If you have an idea on how to improve this estimation I would love to hear it. It looks like the difference is linear, so maybe we could simply multiply the current estimation by two? Co-authored-by: Irevoire <tamo@meilisearch.com>
2024-11-27 04:25:06 +08:00 · 2022-08-17 16:31:04 +00:00 · 2022-08-17 16:31:04 +00:00 · 79094bcbcf
commit 79094bcbcf
parent 087da5621a 4aae07d5f5
16 changed files with 58 additions and 29 deletions
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@ -5,7 +5,7 @@ use std::fs::{create_dir_all, remove_dir_all};
 use std::path::Path;

 use criterion::{criterion_group, criterion_main, Criterion};
-use heed::{EnvOpenOptions, RwTxn};
+use milli::heed::{EnvOpenOptions, RwTxn};
 use milli::update::{
    DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
 };
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@ -6,8 +6,8 @@ use std::num::ParseFloatError;
 use std::path::Path;

 use criterion::BenchmarkId;
-use heed::EnvOpenOptions;
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+use milli::heed::EnvOpenOptions;
 use milli::update::{
    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
 };
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@ -11,7 +11,6 @@ byte-unit = { version = "4.0.14", features = ["serde"] }
 color-eyre = "0.6.1"
 csv = "1.1.6"
 eyre = "0.6.7"
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
 indicatif = "0.16.2"
 milli = { path = "../milli" }
 mimalloc = { version = "0.1.29", default-features = false }
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{
    ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
 };
 use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
-use milli::{Index, Object};
+use milli::{heed, Index, Object};
 use structopt::StructOpt;

 #[global_allocator]
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@ -9,7 +9,6 @@ publish = false
 [dependencies]
 anyhow = "1.0.56"
 byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
 milli = { path = "../milli" }
 mimalloc = { version = "0.1.29", default-features = false }
 stderrlog = "0.5.1"
--- a/helpers/src/main.rs
+++ b/helpers/src/main.rs
@ -1,7 +1,7 @@
 use std::path::PathBuf;

 use byte_unit::Byte;
-use heed::{CompactionOption, Env, EnvOpenOptions};
+use milli::heed::{CompactionOption, Env, EnvOpenOptions};
 use structopt::StructOpt;
 use Command::*;

--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@ -10,7 +10,6 @@ publish = false
 anyhow = "1.0.56"
 byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.2"
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
 memmap2 = "0.5.3"
 milli = { path = "../milli" }
 mimalloc = { version = "0.1.29", default-features = false }
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@ -17,8 +17,8 @@ use byte_unit::Byte;
 use either::Either;
 use flate2::read::GzDecoder;
 use futures::{stream, FutureExt, StreamExt};
-use heed::EnvOpenOptions;
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+use milli::heed::EnvOpenOptions;
 use milli::tokenizer::TokenizerBuilder;
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{
--- a/http-ui/src/update_store.rs
+++ b/http-ui/src/update_store.rs
@ -6,6 +6,7 @@ use std::sync::Arc;
 use crossbeam_channel::Sender;
 use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
 use heed::{Database, Env, EnvOpenOptions};
+use milli::heed;
 use serde::{Deserialize, Serialize};

 pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@ -9,7 +9,6 @@ publish = false
 anyhow = "1.0.56"
 byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
 csv = "1.1.6"
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
 milli = { path = "../milli" }
 mimalloc = { version = "0.1.29", default-features = false }
 roaring = "0.9.0"
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@ -7,7 +7,7 @@ use byte_unit::Byte;
 use heed::EnvOpenOptions;
 use milli::facet::FacetType;
 use milli::index::db_name::*;
-use milli::{FieldId, Index};
+use milli::{heed, FieldId, Index};
 use structopt::StructOpt;
 use Command::*;

--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -18,7 +18,7 @@ fst = "0.4.7"
 fxhash = "0.2.1"
 geoutils = "0.4.1"
 grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
+heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 memmap2 = "0.5.3"
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@ -116,6 +116,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
        }
    )]
    InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
+    #[error("{}", HeedError::BadOpenOptions)]
+    InvalidLmdbOpenOptions,
    #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")]
    SortRankingRuleMissing,
    #[error("The database file is in an invalid state.")]
@ -244,6 +246,7 @@ impl From<HeedError> for Error {
            HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
            HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
            HeedError::DatabaseClosing => InternalError(DatabaseClosing),
+            HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions),
        }
    }
 }
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -223,6 +223,16 @@ impl Index {
        self.env.path()
    }

+    /// Returns the size used by the index without the cached pages.
+    pub fn used_size(&self) -> Result<u64> {
+        Ok(self.env.non_free_pages_size()?)
+    }
+
+    /// Returns the real size used by the index.
+    pub fn on_disk_size(&self) -> Result<u64> {
+        Ok(self.env.real_disk_size()?)
+    }
+
    pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
        self.env.copy_to_path(path, option).map_err(Into::into)
    }
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@ -20,10 +20,6 @@ use crate::{
    RoaringBitmapCodec, SmallString32, BEU32,
 };

-/// The threshold we use to determine after which number of documents we want to clear the
-/// soft-deleted database and delete documents for real.
-const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000;
-
 pub struct DeleteDocuments<'t, 'u, 'i> {
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
    index: &'i Index,
@ -129,7 +125,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {

        // if we have less documents to delete than the threshold we simply save them in
        // the `soft_deleted_documents_ids` bitmap and early exit.
-        if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD {
+        let size_used = self.index.used_size()?;
+        let map_size = self.index.env.map_size()? as u64;
+        let nb_documents = self.index.number_of_documents(&self.wtxn)?;
+        let nb_soft_deleted = soft_deleted_docids.len();
+
+        let percentage_available = 100 - (size_used * 100 / map_size);
+        let estimated_document_size = size_used / (nb_documents + nb_soft_deleted);
+        let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted;
+        let percentage_used_by_soft_deleted_documents =
+            estimated_size_used_by_soft_deleted * 100 / map_size;
+
+        // If more than 10% of the disk space is still available and the soft-deleted
+        // documents use less than 10% of the total space available,
+        // we skip the deletion. E.g.:
+        // - With 100GB of disk and 20GB used, including 5GB of soft-deleted documents,
+        //   we don’t delete anything.
+        // - With 100GB of disk and 95GB used, including 1MB of soft-deleted documents,
+        //   we run the deletion.
+        // - With 100GB of disk and 50GB used, including 15GB of soft-deleted documents,
+        //   we run the deletion.
+        if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 {
            self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
            return Ok(DocumentDeletionResult {
                deleted_documents: self.to_delete_docids.len(),
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -278,27 +278,30 @@ where
        let stop_words = self.index.stop_words(self.wtxn)?;
        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;

-        // Run extraction pipeline in parallel.
-        pool.install(|| {
-            let params = GrenadParameters {
+        let pool_params = GrenadParameters {
            chunk_compression_type: self.indexer_config.chunk_compression_type,
            chunk_compression_level: self.indexer_config.chunk_compression_level,
            max_memory: self.indexer_config.max_memory,
            max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
        };
+        let documents_chunk_size =
+            self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
+        let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;

+        // Run extraction pipeline in parallel.
+        pool.install(|| {
            // split obkv file into several chunks
            let original_chunk_iter = grenad_obkv_into_chunks(
                original_documents,
-                params.clone(),
-                self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
+                pool_params.clone(),
+                documents_chunk_size,
            );

            // split obkv file into several chunks
            let flattened_chunk_iter = grenad_obkv_into_chunks(
                flattened_documents,
-                params.clone(),
-                self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
+                pool_params.clone(),
+                documents_chunk_size,
            );

            let result = original_chunk_iter
@ -308,14 +311,14 @@ where
                    extract::data_from_obkv_documents(
                        original_chunk,
                        flattened_chunk,
-                        params,
+                        pool_params,
                        lmdb_writer_sx.clone(),
                        searchable_fields,
                        faceted_fields,
                        primary_key_id,
                        geo_fields_ids,
                        stop_words,
-                        self.indexer_config.max_positions_per_attributes,
+                        max_positions_per_attributes,
                        exact_attributes,
                    )
                });