diff --git a/http-ui/public/updates-script.js b/http-ui/public/updates-script.js
index c034c45b8..5d439a7f5 100644
--- a/http-ui/public/updates-script.js
+++ b/http-ui/public/updates-script.js
@@ -54,13 +54,22 @@ $(window).on('load', function () {
const content = $(`#${id} .updateStatus.content`);
let html;
- let { type, processed_number_of_documents, total_number_of_documents } = status.meta;
- if (type === 'DocumentsAddition' && processed_number_of_documents && total_number_of_documents) {
- let progress = Math.round(processed_number_of_documents / total_number_of_documents * 100);
+
+ let { type, step, total_steps, current, total } = status.meta;
+
+ if (type === 'DocumentsAddition') {
+ // If the total is null or undefined then the progress result is Infinity (or NaN).
+ let progress = Math.round(current / total * 100);
+ // We must divide the progress by the total number of indexing steps.
+ progress = progress / total_steps;
+ // And mark the previous steps as processed.
+ progress = progress + (step * 100 / total_steps);
+ // Generate the appropriate html bulma progress bar.
html = `<progress class="progress" title="${progress}%" value="${progress}" max="100"></progress>`;
} else {
html = `<progress class="progress" max="100"></progress>`;
}
+
content.html(html);
}
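
Note: a standalone sketch of the progress math above (not part of the patch; the function name and sample values are illustrative only). Each indexing step owns an equal slice of the bar, and completed steps count as fully processed:

```rust
/// Minimal sketch of the same arithmetic the script performs.
fn overall_progress(step: usize, total_steps: usize, current: usize, total: usize) -> f64 {
    // Progress within the current step, as a percentage.
    let within_step = current as f64 / total as f64 * 100.0;
    // Scale it down to this step's share of the whole bar...
    let scaled = within_step / total_steps as f64;
    // ...and credit the steps that are already done.
    scaled + step as f64 * 100.0 / total_steps as f64
}

fn main() {
    // Halfway through step 2 of 4: two finished slices (50%) plus half of a 25% slice.
    assert_eq!(overall_progress(2, 4, 50, 100), 62.5);
}
```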
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 281eb0c8e..2af2c9d90 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -26,6 +26,7 @@ use warp::filters::ws::Message;
use warp::{Filter, http::Response};
use milli::tokenizer::{simple_tokenizer, TokenType};
+use milli::update::UpdateIndexingStep::*;
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
@@ -201,8 +202,10 @@ enum UpdateMeta {
#[serde(tag = "type")]
enum UpdateMetaProgress {
DocumentsAddition {
- processed_number_of_documents: usize,
- total_number_of_documents: Option<usize>,
+ step: usize,
+ total_steps: usize,
+ current: usize,
+ total: Option<usize>,
},
}
@@ -310,12 +313,20 @@ async fn main() -> anyhow::Result<()> {
Box::new(content) as Box<dyn io::Read>
};
- let result = builder.execute(reader, |count, total| {
+ let result = builder.execute(reader, |indexing_step| {
+ let (current, total) = match indexing_step {
+ TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
+ ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+ IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+ MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
+ };
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
update_id,
meta: UpdateMetaProgress::DocumentsAddition {
- processed_number_of_documents: count,
- total_number_of_documents: Some(total),
+ step: indexing_step.step(),
+ total_steps: indexing_step.number_of_steps(),
+ current,
+ total,
}
});
});
@@ -356,12 +367,20 @@ async fn main() -> anyhow::Result<()> {
}
}
- let result = builder.execute(|count, total| {
+ let result = builder.execute(|indexing_step| {
+ let (current, total) = match indexing_step {
+ TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
+ ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+ IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+ MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
+ };
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
update_id,
meta: UpdateMetaProgress::DocumentsAddition {
- processed_number_of_documents: count,
- total_number_of_documents: Some(total),
+ step: indexing_step.step(),
+ total_steps: indexing_step.number_of_steps(),
+ current,
+ total,
}
});
});
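
For reference, since `UpdateMetaProgress` is internally tagged via `#[serde(tag = "type")]`, every websocket message carries the variant name next to the new fields. A self-contained sketch of the resulting shape, assuming the serde and serde_json crates (not part of this patch):

```rust
use serde::Serialize;

#[derive(Serialize)]
#[serde(tag = "type")]
enum UpdateMetaProgress {
    DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
}

fn main() {
    let meta = UpdateMetaProgress::DocumentsAddition {
        step: 2, total_steps: 4, current: 50, total: Some(100),
    };
    // Prints: {"type":"DocumentsAddition","step":2,"total_steps":4,"current":50,"total":100}
    println!("{}", serde_json::to_string(&meta).unwrap());
}
```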
diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs
index ad542d0f0..f078ec9ed 100644
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@@ -15,6 +15,7 @@ use rayon::prelude::*;
use rayon::ThreadPool;
use crate::index::Index;
+use crate::update::UpdateIndexingStep;
use self::store::Store;
use self::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
@@ -249,13 +250,14 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<()>
where
R: io::Read,
- F: Fn(usize, usize) + Sync,
+ F: Fn(UpdateIndexingStep) + Sync,
{
let before_transform = Instant::now();
let transform = Transform {
rtxn: &self.wtxn,
index: self.index,
+ log_every_n: self.log_every_n,
chunk_compression_type: self.chunk_compression_type,
chunk_compression_level: self.chunk_compression_level,
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
@@ -266,9 +268,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
};
let output = match self.update_format {
- UpdateFormat::Csv => transform.from_csv(reader)?,
- UpdateFormat::Json => transform.from_json(reader)?,
- UpdateFormat::JsonStream => transform.from_json_stream(reader)?,
+ UpdateFormat::Csv => transform.from_csv(reader, &progress_callback)?,
+ UpdateFormat::Json => transform.from_json(reader, &progress_callback)?,
+ UpdateFormat::JsonStream => transform.from_json_stream(reader, &progress_callback)?,
};
info!("Update transformed in {:.02?}", before_transform.elapsed());
@@ -278,7 +280,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()>
where
- F: Fn(usize, usize) + Sync
+ F: Fn(UpdateIndexingStep) + Sync
{
let before_indexing = Instant::now();
@@ -460,6 +462,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
documents_ids.union_with(&replaced_documents_ids);
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
+ let mut database_count = 0;
+ progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ databases_seen: 0,
+ total_databases: 5,
+ });
+
debug!("Writing the docid word positions into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
@@ -469,6 +477,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method
)?;
+ database_count += 1;
+ progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ databases_seen: database_count,
+ total_databases: 5,
+ });
+
debug!("Writing the documents into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
@@ -478,6 +492,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method
)?;
+ database_count += 1;
+ progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ databases_seen: database_count,
+ total_databases: 5,
+ });
+
debug!("Writing the words pairs proximities docids into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
@@ -487,6 +507,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method,
)?;
+ database_count += 1;
+ progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ databases_seen: database_count,
+ total_databases: 5,
+ });
+
for (db_type, result) in receiver {
let content = result?;
match db_type {
@@ -512,8 +538,16 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
)?;
},
}
+
+ database_count += 1;
+ progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ databases_seen: database_count,
+ total_databases: 5,
+ });
}
+ debug_assert_eq!(database_count, 5);
+
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
Ok(())
@@ -537,7 +571,7 @@ mod tests {
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
@@ -551,7 +585,7 @@ mod tests {
let content = &b"id,name\n1,updated kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is **always** 3 documents.
@@ -565,7 +599,7 @@ mod tests {
let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is **always** 3 documents.
@@ -589,7 +623,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is only 1 document now.
@@ -616,7 +650,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is **always** 1 document.
@@ -652,7 +686,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.disable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
- assert!(builder.execute(content, |_, _| ()).is_err());
+ assert!(builder.execute(content, |_| ()).is_err());
wtxn.commit().unwrap();
// Check that there is no document.
@@ -679,7 +713,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.disable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
- assert!(builder.execute(content, |_, _| ()).is_err());
+ assert!(builder.execute(content, |_| ()).is_err());
wtxn.commit().unwrap();
// Check that there is no document.
@@ -701,7 +735,7 @@ mod tests {
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
@@ -719,7 +753,7 @@ mod tests {
let content = format!("id,name\n{},updated kevin", kevin_uuid);
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content.as_bytes(), |_, _| ()).unwrap();
+ builder.execute(content.as_bytes(), |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is **always** 3 documents.
@@ -754,7 +788,7 @@ mod tests {
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
@@ -768,7 +802,7 @@ mod tests {
let content = &b"name\nnew kevin"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 4 documents now.
@@ -790,7 +824,7 @@ mod tests {
let content = &b"id,name\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is no documents.
@@ -816,7 +850,7 @@ mod tests {
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Json);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
@@ -838,7 +872,7 @@ mod tests {
let content = &b"[]"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Json);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is no documents.
@@ -864,7 +898,7 @@ mod tests {
"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::JsonStream);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
@@ -887,7 +921,7 @@ mod tests {
let content = &b"id,name\nbrume bleue,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- assert!(builder.execute(content, |_, _| ()).is_err());
+ assert!(builder.execute(content, |_| ()).is_err());
wtxn.commit().unwrap();
// First we send 1 document with a valid id.
@@ -896,7 +930,7 @@ mod tests {
let content = &b"id,name\n32,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 1 document now.
@@ -922,7 +956,7 @@ mod tests {
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Json);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 1 documents now.
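
The merge bookkeeping above hard-codes `total_databases: 5` at five call sites and re-checks it with `debug_assert_eq!`. A hypothetical helper (names are illustrative, not in the patch) could keep the constant and the callback in one place:

```rust
// Trimmed-down enum for the sketch; the real one is introduced in update_step.rs below.
#[derive(Debug, Clone, Copy)]
enum UpdateIndexingStep {
    MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize },
}

const TOTAL_DATABASES: usize = 5;

// One place to bump and report the databases-seen counter.
fn report_merge_progress<F>(databases_seen: usize, progress_callback: &F)
where
    F: Fn(UpdateIndexingStep) + Sync,
{
    progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
        databases_seen,
        total_databases: TOTAL_DATABASES,
    });
}

fn main() {
    let callback = |step| println!("{:?}", step);
    for seen in 0..=TOTAL_DATABASES {
        report_merge_progress(seen, &callback);
    }
}
```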
diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs
index 1503ef5fb..6d42f0119 100644
--- a/src/update/index_documents/store.rs
+++ b/src/update/index_documents/store.rs
@@ -16,6 +16,7 @@ use tempfile::tempfile;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token};
+use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
@@ -294,7 +295,7 @@ impl Store {
log_every_n: Option<usize>,
mut progress_callback: F,
) -> anyhow::Result<Readers>
- where F: FnMut(usize, usize),
+ where F: FnMut(UpdateIndexingStep),
{
debug!("{:?}: Indexing in a Store...", thread_index);
@@ -311,7 +312,10 @@ impl Store {
// This is a log routine that we do every `log_every_n` documents.
if log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
- progress_callback(count, documents_count);
+ progress_callback(UpdateIndexingStep::IndexDocuments {
+ documents_seen: count,
+ total_documents: documents_count,
+ });
before = Instant::now();
}
@@ -343,7 +347,10 @@ impl Store {
count = count + 1;
}
- progress_callback(count, documents_count);
+ progress_callback(UpdateIndexingStep::IndexDocuments {
+ documents_seen: count,
+ total_documents: documents_count,
+ });
let readers = self.finish()?;
debug!("{:?}: Store created!", thread_index);
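
Both store.rs (above) and transform.rs (below) throttle the callback with `log_every_n.map_or(false, |len| count % len == 0)`. In isolation, the predicate behaves like this sketch:

```rust
// Fire only every `log_every_n` documents, and never when no interval is configured.
fn should_report(count: usize, log_every_n: Option<usize>) -> bool {
    log_every_n.map_or(false, |len| count % len == 0)
}

fn main() {
    let every = Some(3);
    let reported: Vec<usize> = (0..10).filter(|&count| should_report(count, every)).collect();
    assert_eq!(reported, vec![0, 3, 6, 9]); // includes the very first document (count 0)
    assert!(!should_report(42, None)); // logging disabled: the throttled calls never fire
}
```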
diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs
index cb583849a..9847998a5 100644
--- a/src/update/index_documents/transform.rs
+++ b/src/update/index_documents/transform.rs
@@ -11,7 +11,7 @@ use roaring::RoaringBitmap;
use serde_json::{Map, Value};
use crate::{BEU32, MergeFn, Index, FieldsIdsMap};
-use crate::update::AvailableDocumentsIds;
+use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod};
@@ -34,6 +34,7 @@ pub struct TransformOutput {
pub struct Transform<'t, 'i> {
pub rtxn: &'t heed::RoTxn<'i>,
pub index: &'i Index,
+ pub log_every_n: Option<usize>,
pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>,
pub chunk_fusing_shrink_size: Option<u64>,
@@ -44,15 +45,32 @@ pub struct Transform<'t, 'i> {
}
impl Transform<'_, '_> {
- pub fn from_json<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
- self.from_generic_json(reader, false)
+ pub fn from_json<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
+ where
+ R: Read,
+ F: Fn(UpdateIndexingStep) + Sync,
+ {
+ self.from_generic_json(reader, false, progress_callback)
}
- pub fn from_json_stream<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
- self.from_generic_json(reader, true)
+ pub fn from_json_stream<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
+ where
+ R: Read,
+ F: Fn(UpdateIndexingStep) + Sync,
+ {
+ self.from_generic_json(reader, true, progress_callback)
}
- fn from_generic_json<R: Read>(self, reader: R, is_stream: bool) -> anyhow::Result<TransformOutput> {
+ fn from_generic_json<R, F>(
+ self,
+ reader: R,
+ is_stream: bool,
+ progress_callback: F,
+ ) -> anyhow::Result<TransformOutput>
+ where
+ R: Read,
+ F: Fn(UpdateIndexingStep) + Sync,
+ {
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
let primary_key = self.index.primary_key(self.rtxn)?;
@@ -131,10 +149,17 @@ impl Transform<'_, '_> {
let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
+ let mut documents_count = 0;
for result in documents {
let document = result?;
+ if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+ progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+ documents_seen: documents_count,
+ });
+ }
+
obkv_buffer.clear();
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
@@ -186,14 +211,30 @@ impl Transform<'_, '_> {
// We use the extracted/generated user id as the key for this document.
sorter.insert(user_id.as_bytes(), &obkv_buffer)?;
+ documents_count += 1;
}
+ progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+ documents_seen: documents_count,
+ });
+
// Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput.
- self.from_sorter(sorter, primary_key, fields_ids_map, users_ids_documents_ids)
+ self.from_sorter(
+ sorter,
+ primary_key,
+ fields_ids_map,
+ documents_count,
+ users_ids_documents_ids,
+ progress_callback,
+ )
}
- pub fn from_csv<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
+ pub fn from_csv<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
+ where
+ R: Read,
+ F: Fn(UpdateIndexingStep) + Sync,
+ {
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
@@ -255,12 +296,19 @@ impl Transform<'_, '_> {
let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
+ let mut documents_count = 0;
let mut record = csv::StringRecord::new();
while csv.read_record(&mut record)? {
obkv_buffer.clear();
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
+ if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+ progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+ documents_seen: documents_count,
+ });
+ }
+
// We extract the user id if we know where it is or generate an UUID V4 otherwise.
let user_id = match user_id_pos {
Some(pos) => {
@@ -292,23 +340,39 @@ impl Transform<'_, '_> {
// We use the extracted/generated user id as the key for this document.
sorter.insert(user_id, &obkv_buffer)?;
+ documents_count += 1;
}
+ progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+ documents_seen: documents_count,
+ });
+
// Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput.
- self.from_sorter(sorter, primary_key_field_id, fields_ids_map, users_ids_documents_ids)
+ self.from_sorter(
+ sorter,
+ primary_key_field_id,
+ fields_ids_map,
+ documents_count,
+ users_ids_documents_ids,
+ progress_callback,
+ )
}
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
/// id for the user side and the value must be an obkv where keys are valid fields ids.
- fn from_sorter(
+ fn from_sorter<F>(
self,
sorter: grenad::Sorter<MergeFn>,
primary_key: u8,
fields_ids_map: FieldsIdsMap,
+ approximate_number_of_documents: usize,
users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>,
+ progress_callback: F,
) -> anyhow::Result<TransformOutput>
+ where
+ F: Fn(UpdateIndexingStep) + Sync,
{
let documents_ids = self.index.documents_ids(self.rtxn)?;
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
@@ -332,6 +396,13 @@ impl Transform<'_, '_> {
let mut iter = sorter.into_iter()?;
while let Some((user_id, update_obkv)) = iter.next()? {
+ if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+ progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
+ documents_seen: documents_count,
+ total_documents: approximate_number_of_documents,
+ });
+ }
+
let (docid, obkv) = match users_ids_documents_ids.get(user_id) {
Some(docid) => {
// If we find the user id in the current users ids documents ids map
@@ -369,6 +440,11 @@ impl Transform<'_, '_> {
documents_count += 1;
}
+ progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
+ documents_seen: documents_count,
+ total_documents: documents_count,
+ });
+
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
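
The `approximate_number_of_documents` passed to `from_sorter` is only an upper bound: documents sharing a user id are merged or replaced inside the sorter, so the final count can be lower, which is why the last callback reports `documents_count` for both fields. A small illustration (plain Rust, not crate code):

```rust
use std::collections::BTreeMap;

fn main() {
    let batch = [("1", "kevin"), ("2", "kevina"), ("1", "updated kevin")];
    let approximate_number_of_documents = batch.len(); // 3 documents seen by the transform

    // The sorter keys documents by user id, so both "1" entries collapse into one.
    let merged: BTreeMap<_, _> = batch.into_iter().collect();
    let documents_count = merged.len(); // 2 documents actually kept

    assert!(documents_count <= approximate_number_of_documents);
}
```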
diff --git a/src/update/mod.rs b/src/update/mod.rs
index 3582820b4..75724269a 100644
--- a/src/update/mod.rs
+++ b/src/update/mod.rs
@@ -4,6 +4,7 @@ mod delete_documents;
mod index_documents;
mod settings;
mod update_builder;
+mod update_step;
mod update_store;
pub use self::available_documents_ids::AvailableDocumentsIds;
@@ -12,4 +13,5 @@ pub use self::delete_documents::DeleteDocuments;
pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
pub use self::settings::Settings;
pub use self::update_builder::UpdateBuilder;
+pub use self::update_step::UpdateIndexingStep;
pub use self::update_store::UpdateStore;
diff --git a/src/update/settings.rs b/src/update/settings.rs
index 427f0e926..ad191ceec 100644
--- a/src/update/settings.rs
+++ b/src/update/settings.rs
@@ -3,7 +3,7 @@ use grenad::CompressionType;
use rayon::ThreadPool;
use crate::update::index_documents::{Transform, IndexDocumentsMethod};
-use crate::update::{ClearDocuments, IndexDocuments};
+use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::{Index, FieldsIdsMap};
pub struct Settings<'a, 't, 'u, 'i> {
@@ -60,7 +60,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()>
where
- F: Fn(usize, usize) + Sync
+ F: Fn(UpdateIndexingStep) + Sync
{
// Check that the searchable attributes have been specified.
if let Some(value) = self.searchable_fields {
@@ -126,6 +126,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let transform = Transform {
rtxn: &self.wtxn,
index: self.index,
+ log_every_n: self.log_every_n,
chunk_compression_type: self.chunk_compression_type,
chunk_compression_level: self.chunk_compression_level,
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
@@ -231,14 +232,14 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// We change the searchable fields to be the "name" field only.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.set_searchable_fields(vec!["name".into()]);
- builder.execute(|_, _| ()).unwrap();
+ builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the searchable field is correctly set to "name" only.
@@ -260,7 +261,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.reset_searchable_fields();
- builder.execute(|_, _| ()).unwrap();
+ builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the searchable field have been reset and documents are found now.
@@ -286,7 +287,7 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// In the same transaction we change the displayed fields to be only the "age".
@@ -295,7 +296,7 @@ mod tests {
let mut builder = Settings::new(&mut wtxn, &index);
builder.set_displayed_fields(vec!["age".into()]);
builder.set_searchable_fields(vec!["name".into()]);
- builder.execute(|_, _| ()).unwrap();
+ builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to `None` (default value).
@@ -310,7 +311,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.reset_searchable_fields();
- builder.execute(|_, _| ()).unwrap();
+ builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the displayed fields always contains only the "age" field.
@@ -334,7 +335,7 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to `None` (default value).
@@ -356,12 +357,12 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
- builder.execute(content, |_, _| ()).unwrap();
+ builder.execute(content, |_| ()).unwrap();
// In the same transaction we change the displayed fields to be only the age.
let mut builder = Settings::new(&mut wtxn, &index);
builder.set_displayed_fields(vec!["age".into()]);
- builder.execute(|_, _| ()).unwrap();
+ builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to only the "age" field.
@@ -376,7 +377,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.reset_displayed_fields();
- builder.execute(|_, _| ()).unwrap();
+ builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to `None` (default value).
diff --git a/src/update/update_step.rs b/src/update/update_step.rs
new file mode 100644
index 000000000..68a32bb67
--- /dev/null
+++ b/src/update/update_step.rs
@@ -0,0 +1,36 @@
+use UpdateIndexingStep::*;
+
+#[derive(Debug, Clone, Copy)]
+pub enum UpdateIndexingStep {
+ /// Transform from the original user-given format (CSV, JSON, JSON lines)
+ /// into a generic format based on the obkv and grenad crates. This step also
+ /// deduplicates potential documents in this batch update by merging or replacing them.
+ TransformFromUserIntoGenericFormat { documents_seen: usize },
+
+ /// This step checks the external document ids, computes the internal ids and merges
+ /// the documents that are already present in the database.
+ ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize },
+
+ /// Extract the documents' words using the tokenizer and compute the documents'
+ /// facets. Store those words, facets and document ids on disk.
+ IndexDocuments { documents_seen: usize, total_documents: usize },
+
+ /// Merge the previously extracted data (words and facets) into the final LMDB database.
+ /// This extracted data is split across multiple databases.
+ MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize },
+}
+
+impl UpdateIndexingStep {
+ pub const fn step(&self) -> usize {
+ match self {
+ TransformFromUserIntoGenericFormat { .. } => 0,
+ ComputeIdsAndMergeDocuments { .. } => 1,
+ IndexDocuments { .. } => 2,
+ MergeDataIntoFinalDatabase { .. } => 3,
+ }
+ }
+
+ pub const fn number_of_steps(&self) -> usize {
+ 4
+ }
+}