From e78b96a65767a0b791064a2e20afb92d3163b1ea Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Wed, 11 Nov 2020 12:16:01 +0100
Subject: [PATCH] Introduce a more detailed progress status enum

---
 src/update/mod.rs         |  2 ++
 src/update/update_step.rs | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 src/update/update_step.rs

diff --git a/src/update/mod.rs b/src/update/mod.rs
index 3582820b4..75724269a 100644
--- a/src/update/mod.rs
+++ b/src/update/mod.rs
@@ -4,6 +4,7 @@ mod delete_documents;
 mod index_documents;
 mod settings;
 mod update_builder;
+mod update_step;
 mod update_store;
 
 pub use self::available_documents_ids::AvailableDocumentsIds;
@@ -12,4 +13,5 @@ pub use self::delete_documents::DeleteDocuments;
 pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
 pub use self::settings::Settings;
 pub use self::update_builder::UpdateBuilder;
+pub use self::update_step::UpdateIndexingStep;
 pub use self::update_store::UpdateStore;
diff --git a/src/update/update_step.rs b/src/update/update_step.rs
new file mode 100644
index 000000000..691c9ec2a
--- /dev/null
+++ b/src/update/update_step.rs
@@ -0,0 +1,36 @@
+use UpdateIndexingStep::*;
+
+#[derive(Debug, Clone, Copy)]
+pub enum UpdateIndexingStep {
+    /// Transform from the original user-given format (CSV, JSON, JSON lines)
+    /// into a generic format based on the obkv and grenad crates. This step also
+    /// deduplicates potential documents in this batch update by merging or replacing them.
+    TransformFromUserIntoGenericFormat { documents_seen: usize },
+
+    /// This step checks the external document ids, computes the internal ids, and
+    /// merges the documents that are already present in the database.
+    ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize },
+
+    /// Extract the document words using the tokenizer and compute the document
+    /// facets. Store those words, facets, and document ids on disk.
+    IndexDocuments { documents_seen: usize, total_documents: usize },
+
+    /// Merge the previously extracted data (words and facets) into the final LMDB database.
+    /// This extracted data is split into multiple databases.
+    MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize },
+}
+
+impl UpdateIndexingStep {
+    pub const fn step_index(&self) -> usize {
+        match self {
+            TransformFromUserIntoGenericFormat { .. } => 0,
+            ComputeIdsAndMergeDocuments { .. } => 1,
+            IndexDocuments { .. } => 2,
+            MergeDataIntoFinalDatabase { .. } => 3,
+        }
+    }
+
+    pub const fn number_of_steps(&self) -> usize {
+        4
+    }
+}
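
--
Usage note (not part of this patch): a minimal sketch of how a consumer of this
enum might render progress, assuming the indexing process hands each step to a
callback. The `report_progress` function, the callback wiring, and the
`milli::update` import path are assumptions for illustration only.

    use milli::update::UpdateIndexingStep;

    // Hypothetical progress reporter: turns each step notification into a
    // human-readable log line. The transform step streams documents and has
    // no known total yet, so it only reports the running count.
    fn report_progress(step: UpdateIndexingStep) {
        use UpdateIndexingStep::*;

        let (seen, total) = match step {
            TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
            ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
                (documents_seen, Some(total_documents))
            }
            IndexDocuments { documents_seen, total_documents } => {
                (documents_seen, Some(total_documents))
            }
            MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
                (databases_seen, Some(total_databases))
            }
        };

        // `step_index` is zero-based, so add one for display purposes.
        match total {
            Some(total) => println!(
                "step {}/{}: {}/{} processed",
                step.step_index() + 1,
                step.number_of_steps(),
                seen,
                total,
            ),
            None => println!(
                "step {}/{}: {} documents seen",
                step.step_index() + 1,
                step.number_of_steps(),
                seen,
            ),
        }
    }

Because the enum derives Copy, a callback can match on the step and still call
its methods afterwards without cloning.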