diff --git a/http-ui/public/updates-script.js b/http-ui/public/updates-script.js index c034c45b8..5d439a7f5 100644 --- a/http-ui/public/updates-script.js +++ b/http-ui/public/updates-script.js @@ -54,13 +54,22 @@ $(window).on('load', function () { const content = $(`#${id} .updateStatus.content`); let html; - let { type, processed_number_of_documents, total_number_of_documents } = status.meta; - if (type === 'DocumentsAddition' && processed_number_of_documents && total_number_of_documents) { - let progress = Math.round(processed_number_of_documents / total_number_of_documents * 100); + + let { type, step, total_steps, current, total } = status.meta; + + if (type === 'DocumentsAddition') { + // If the total is null or undefined then the progress results is infinity. + let progress = Math.round(current / total * 100); + // We must divide the progress by the total number of indexing steps. + progress = progress / total_steps; + // And mark the previous steps as processed. + progress = progress + (step * 100 / total_steps); + // Generate the appropriate html bulma progress bar. html = ``; } else { html = ``; } + content.html(html); } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 281eb0c8e..2af2c9d90 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -26,6 +26,7 @@ use warp::filters::ws::Message; use warp::{Filter, http::Response}; use milli::tokenizer::{simple_tokenizer, TokenType}; +use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult}; @@ -201,8 +202,10 @@ enum UpdateMeta { #[serde(tag = "type")] enum UpdateMetaProgress { DocumentsAddition { - processed_number_of_documents: usize, - total_number_of_documents: Option, + step: usize, + total_steps: usize, + current: usize, + total: Option, }, } @@ -310,12 +313,20 @@ async fn main() -> anyhow::Result<()> { Box::new(content) as Box }; - let result = builder.execute(reader, |count, total| { + let result = builder.execute(reader, |indexing_step| { + let (current, total) = match indexing_step { + TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), + IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), + MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), + }; let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { update_id, meta: UpdateMetaProgress::DocumentsAddition { - processed_number_of_documents: count, - total_number_of_documents: Some(total), + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, } }); }); @@ -356,12 +367,20 @@ async fn main() -> anyhow::Result<()> { } } - let result = builder.execute(|count, total| { + let result = builder.execute(|indexing_step| { + let (current, total) = match indexing_step { + TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), + IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), + MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), + }; let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { update_id, meta: UpdateMetaProgress::DocumentsAddition { - processed_number_of_documents: count, - total_number_of_documents: Some(total), + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, } }); }); diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index ad542d0f0..f078ec9ed 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -15,6 +15,7 @@ use rayon::prelude::*; use rayon::ThreadPool; use crate::index::Index; +use crate::update::UpdateIndexingStep; use self::store::Store; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, @@ -249,13 +250,14 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result<()> where R: io::Read, - F: Fn(usize, usize) + Sync, + F: Fn(UpdateIndexingStep) + Sync, { let before_transform = Instant::now(); let transform = Transform { rtxn: &self.wtxn, index: self.index, + log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, @@ -266,9 +268,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }; let output = match self.update_format { - UpdateFormat::Csv => transform.from_csv(reader)?, - UpdateFormat::Json => transform.from_json(reader)?, - UpdateFormat::JsonStream => transform.from_json_stream(reader)?, + UpdateFormat::Csv => transform.from_csv(reader, &progress_callback)?, + UpdateFormat::Json => transform.from_json(reader, &progress_callback)?, + UpdateFormat::JsonStream => transform.from_json_stream(reader, &progress_callback)?, }; info!("Update transformed in {:.02?}", before_transform.elapsed()); @@ -278,7 +280,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()> where - F: Fn(usize, usize) + Sync + F: Fn(UpdateIndexingStep) + Sync { let before_indexing = Instant::now(); @@ -460,6 +462,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { documents_ids.union_with(&replaced_documents_ids); self.index.put_documents_ids(self.wtxn, &documents_ids)?; + let mut database_count = 0; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: 0, + total_databases: 5, + }); + debug!("Writing the docid word positions into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, @@ -469,6 +477,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method )?; + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases: 5, + }); + debug!("Writing the documents into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, @@ -478,6 +492,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method )?; + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases: 5, + }); + debug!("Writing the words pairs proximities docids into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, @@ -487,6 +507,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases: 5, + }); + for (db_type, result) in receiver { let content = result?; match db_type { @@ -512,8 +538,16 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { )?; }, } + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases: 5, + }); } + debug_assert_eq!(database_count, 5); + info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); Ok(()) @@ -537,7 +571,7 @@ mod tests { let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -551,7 +585,7 @@ mod tests { let content = &b"id,name\n1,updated kevin\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -565,7 +599,7 @@ mod tests { let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -589,7 +623,7 @@ mod tests { let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is only 1 document now. @@ -616,7 +650,7 @@ mod tests { let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 1 document. @@ -652,7 +686,7 @@ mod tests { let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.disable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); - assert!(builder.execute(content, |_, _| ()).is_err()); + assert!(builder.execute(content, |_| ()).is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -679,7 +713,7 @@ mod tests { let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.disable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); - assert!(builder.execute(content, |_, _| ()).is_err()); + assert!(builder.execute(content, |_| ()).is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -701,7 +735,7 @@ mod tests { let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -719,7 +753,7 @@ mod tests { let content = format!("id,name\n{},updated kevin", kevin_uuid); let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + builder.execute(content.as_bytes(), |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -754,7 +788,7 @@ mod tests { let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -768,7 +802,7 @@ mod tests { let content = &b"name\nnew kevin"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 4 documents now. @@ -790,7 +824,7 @@ mod tests { let content = &b"id,name\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is no documents. @@ -816,7 +850,7 @@ mod tests { ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -838,7 +872,7 @@ mod tests { let content = &b"[]"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is no documents. @@ -864,7 +898,7 @@ mod tests { "#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::JsonStream); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -887,7 +921,7 @@ mod tests { let content = &b"id,name\nbrume bleue,kevin\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - assert!(builder.execute(content, |_, _| ()).is_err()); + assert!(builder.execute(content, |_| ()).is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. @@ -896,7 +930,7 @@ mod tests { let content = &b"id,name\n32,kevin\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 1 document now. @@ -922,7 +956,7 @@ mod tests { ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 1 documents now. diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 1503ef5fb..6d42f0119 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -16,6 +16,7 @@ use tempfile::tempfile; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::tokenizer::{simple_tokenizer, only_token}; +use crate::update::UpdateIndexingStep; use crate::{json_to_string, SmallVec32, Position, DocumentId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; @@ -294,7 +295,7 @@ impl Store { log_every_n: Option, mut progress_callback: F, ) -> anyhow::Result - where F: FnMut(usize, usize), + where F: FnMut(UpdateIndexingStep), { debug!("{:?}: Indexing in a Store...", thread_index); @@ -311,7 +312,10 @@ impl Store { // This is a log routine that we do every `log_every_n` documents. if log_every_n.map_or(false, |len| count % len == 0) { info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); - progress_callback(count, documents_count); + progress_callback(UpdateIndexingStep::IndexDocuments { + documents_seen: count, + total_documents: documents_count, + }); before = Instant::now(); } @@ -343,7 +347,10 @@ impl Store { count = count + 1; } - progress_callback(count, documents_count); + progress_callback(UpdateIndexingStep::IndexDocuments { + documents_seen: count, + total_documents: documents_count, + }); let readers = self.finish()?; debug!("{:?}: Store created!", thread_index); diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index cb583849a..9847998a5 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -11,7 +11,7 @@ use roaring::RoaringBitmap; use serde_json::{Map, Value}; use crate::{BEU32, MergeFn, Index, FieldsIdsMap}; -use crate::update::AvailableDocumentsIds; +use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; @@ -34,6 +34,7 @@ pub struct TransformOutput { pub struct Transform<'t, 'i> { pub rtxn: &'t heed::RoTxn<'i>, pub index: &'i Index, + pub log_every_n: Option, pub chunk_compression_type: CompressionType, pub chunk_compression_level: Option, pub chunk_fusing_shrink_size: Option, @@ -44,15 +45,32 @@ pub struct Transform<'t, 'i> { } impl Transform<'_, '_> { - pub fn from_json(self, reader: R) -> anyhow::Result { - self.from_generic_json(reader, false) + pub fn from_json(self, reader: R, progress_callback: F) -> anyhow::Result + where + R: Read, + F: Fn(UpdateIndexingStep) + Sync, + { + self.from_generic_json(reader, false, progress_callback) } - pub fn from_json_stream(self, reader: R) -> anyhow::Result { - self.from_generic_json(reader, true) + pub fn from_json_stream(self, reader: R, progress_callback: F) -> anyhow::Result + where + R: Read, + F: Fn(UpdateIndexingStep) + Sync, + { + self.from_generic_json(reader, true, progress_callback) } - fn from_generic_json(self, reader: R, is_stream: bool) -> anyhow::Result { + fn from_generic_json( + self, + reader: R, + is_stream: bool, + progress_callback: F, + ) -> anyhow::Result + where + R: Read, + F: Fn(UpdateIndexingStep) + Sync, + { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); let primary_key = self.index.primary_key(self.rtxn)?; @@ -131,10 +149,17 @@ impl Transform<'_, '_> { let mut json_buffer = Vec::new(); let mut obkv_buffer = Vec::new(); let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; + let mut documents_count = 0; for result in documents { let document = result?; + if self.log_every_n.map_or(false, |len| documents_count % len == 0) { + progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { + documents_seen: documents_count, + }); + } + obkv_buffer.clear(); let mut writer = obkv::KvWriter::new(&mut obkv_buffer); @@ -186,14 +211,30 @@ impl Transform<'_, '_> { // We use the extracted/generated user id as the key for this document. sorter.insert(user_id.as_bytes(), &obkv_buffer)?; + documents_count += 1; } + progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { + documents_seen: documents_count, + }); + // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. - self.from_sorter(sorter, primary_key, fields_ids_map, users_ids_documents_ids) + self.from_sorter( + sorter, + primary_key, + fields_ids_map, + documents_count, + users_ids_documents_ids, + progress_callback, + ) } - pub fn from_csv(self, reader: R) -> anyhow::Result { + pub fn from_csv(self, reader: R, progress_callback: F) -> anyhow::Result + where + R: Read, + F: Fn(UpdateIndexingStep) + Sync, + { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); @@ -255,12 +296,19 @@ impl Transform<'_, '_> { let mut json_buffer = Vec::new(); let mut obkv_buffer = Vec::new(); let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; + let mut documents_count = 0; let mut record = csv::StringRecord::new(); while csv.read_record(&mut record)? { obkv_buffer.clear(); let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + if self.log_every_n.map_or(false, |len| documents_count % len == 0) { + progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { + documents_seen: documents_count, + }); + } + // We extract the user id if we know where it is or generate an UUID V4 otherwise. let user_id = match user_id_pos { Some(pos) => { @@ -292,23 +340,39 @@ impl Transform<'_, '_> { // We use the extracted/generated user id as the key for this document. sorter.insert(user_id, &obkv_buffer)?; + documents_count += 1; } + progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { + documents_seen: documents_count, + }); + // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. - self.from_sorter(sorter, primary_key_field_id, fields_ids_map, users_ids_documents_ids) + self.from_sorter( + sorter, + primary_key_field_id, + fields_ids_map, + documents_count, + users_ids_documents_ids, + progress_callback, + ) } /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. - fn from_sorter( + fn from_sorter( self, sorter: grenad::Sorter, primary_key: u8, fields_ids_map: FieldsIdsMap, + approximate_number_of_documents: usize, users_ids_documents_ids: fst::Map>, + progress_callback: F, ) -> anyhow::Result + where + F: Fn(UpdateIndexingStep) + Sync, { let documents_ids = self.index.documents_ids(self.rtxn)?; let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); @@ -332,6 +396,13 @@ impl Transform<'_, '_> { let mut iter = sorter.into_iter()?; while let Some((user_id, update_obkv)) = iter.next()? { + if self.log_every_n.map_or(false, |len| documents_count % len == 0) { + progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { + documents_seen: documents_count, + total_documents: approximate_number_of_documents, + }); + } + let (docid, obkv) = match users_ids_documents_ids.get(user_id) { Some(docid) => { // If we find the user id in the current users ids documents ids map @@ -369,6 +440,11 @@ impl Transform<'_, '_> { documents_count += 1; } + progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { + documents_seen: documents_count, + total_documents: documents_count, + }); + // We create a final writer to write the new documents in order from the sorter. let file = tempfile::tempfile()?; let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; diff --git a/src/update/mod.rs b/src/update/mod.rs index 3582820b4..75724269a 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -4,6 +4,7 @@ mod delete_documents; mod index_documents; mod settings; mod update_builder; +mod update_step; mod update_store; pub use self::available_documents_ids::AvailableDocumentsIds; @@ -12,4 +13,5 @@ pub use self::delete_documents::DeleteDocuments; pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; pub use self::settings::Settings; pub use self::update_builder::UpdateBuilder; +pub use self::update_step::UpdateIndexingStep; pub use self::update_store::UpdateStore; diff --git a/src/update/settings.rs b/src/update/settings.rs index 427f0e926..ad191ceec 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -3,7 +3,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::update::index_documents::{Transform, IndexDocumentsMethod}; -use crate::update::{ClearDocuments, IndexDocuments}; +use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::{Index, FieldsIdsMap}; pub struct Settings<'a, 't, 'u, 'i> { @@ -60,7 +60,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { pub fn execute(self, progress_callback: F) -> anyhow::Result<()> where - F: Fn(usize, usize) + Sync + F: Fn(UpdateIndexingStep) + Sync { // Check that the searchable attributes have been specified. if let Some(value) = self.searchable_fields { @@ -126,6 +126,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let transform = Transform { rtxn: &self.wtxn, index: self.index, + log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, @@ -231,14 +232,14 @@ mod tests { let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index); builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the searchable field is correctly set to "name" only. @@ -260,7 +261,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index); builder.reset_searchable_fields(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the searchable field have been reset and documents are found now. @@ -286,7 +287,7 @@ mod tests { let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // In the same transaction we change the displayed fields to be only the "age". @@ -295,7 +296,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["age".into()]); builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -310,7 +311,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index); builder.reset_searchable_fields(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields always contains only the "age" field. @@ -334,7 +335,7 @@ mod tests { let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -356,12 +357,12 @@ mod tests { let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); // In the same transaction we change the displayed fields to be only the age. let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["age".into()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to only the "age" field. @@ -376,7 +377,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index); builder.reset_displayed_fields(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). diff --git a/src/update/update_step.rs b/src/update/update_step.rs new file mode 100644 index 000000000..68a32bb67 --- /dev/null +++ b/src/update/update_step.rs @@ -0,0 +1,36 @@ +use UpdateIndexingStep::*; + +#[derive(Debug, Clone, Copy)] +pub enum UpdateIndexingStep { + /// Transform from the original user given format (CSV, JSON, JSON lines) + /// into a generic format based on the obkv and grenad crates. This step also + /// deduplicate potential documents in this batch update by merging or replacing them. + TransformFromUserIntoGenericFormat { documents_seen: usize }, + + /// This step check the external document id, computes the internal ids and merge + /// the documents that are already present in the database. + ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize }, + + /// Extract the documents words using the tokenizer and compute the documents + /// facets. Stores those words, facets and documents ids on disk. + IndexDocuments { documents_seen: usize, total_documents: usize }, + + /// Merge the previously extracted data (words and facets) into the final LMDB database. + /// These extracted data are split into multiple databases. + MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize }, +} + +impl UpdateIndexingStep { + pub const fn step(&self) -> usize { + match self { + TransformFromUserIntoGenericFormat { .. } => 0, + ComputeIdsAndMergeDocuments { .. } => 1, + IndexDocuments { .. } => 2, + MergeDataIntoFinalDatabase { .. } => 3, + } + } + + pub const fn number_of_steps(&self) -> usize { + 4 + } +}