From 0dd321afc70330f730c8bc89fef9964fa479c2fb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 14 Nov 2024 10:02:51 +0100 Subject: [PATCH 001/158] reproduce #4984 --- crates/meilisearch/tests/search/mod.rs | 74 ++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index d1091d944..1dc406fb3 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -15,6 +15,7 @@ mod pagination; mod restrict_searchable; mod search_queue; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tempfile::TempDir; @@ -62,6 +63,79 @@ async fn simple_search() { .await; } +#[actix_rt::test] +async fn search_with_stop_word() { + // related to https://github.com/meilisearch/meilisearch/issues/4984 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = + index.update_settings(json!({"stopWords": ["the", "a", "an", "to", "in", "of"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // prefix search + index + .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + } + ] + "###); + }) + .await; + + // non-prefix search + index + .search(json!({"q": "to the ", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "_formatted": { + "title": "Shazam!" 
+ } + }, + { + "title": "Captain Marvel", + "_formatted": { + "title": "Captain Marvel" + } + }, + { + "title": "Escape Room", + "_formatted": { + "title": "Escape Room" + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + }, + { + "title": "Gläss", + "_formatted": { + "title": "Gläss" + } + } + ] + "###); + }) + .await; +} + #[actix_rt::test] async fn phrase_search_with_stop_word() { // related to https://github.com/meilisearch/meilisearch/issues/3521 From 72ba35349887e64f0ae69079c8b27de21d141ba8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 10:03:23 +0100 Subject: [PATCH 002/158] reproduce sdk fail --- crates/meilisearch/tests/search/formatted.rs | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index 1484b6393..c549cd79d 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -4,6 +4,58 @@ use super::*; use crate::common::Server; use crate::json; +#[actix_rt::test] +async fn search_formatted_from_sdk() { + let server = Server::new_shared(); + let index = server.unique_index(); + + index + .update_settings( + json!({ "filterableAttributes": ["genre"], "searchableAttributes": ["title"] }), + ) + .await; + + let documents = json!([ + { "id": 123, "title": "Pride and Prejudice", "genre": "romance" }, + { "id": 456, "title": "Le Petit Prince", "genre": "adventure" }, + { "id": 1, "title": "Alice In Wonderland", "genre": "adventure" }, + { "id": 2, "title": "Le Rouge et le Noir", "genre": "romance" }, + { "id": 1344, "title": "The Hobbit", "genre": "adventure" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "genre": "fantasy" }, + { "id": 7, "title": "Harry Potter and the Chamber of Secrets", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy" } + ]); + let (response, _) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + index + .search( + json!({ "q":"prince", + "attributesToCrop": ["title"], + "cropLength": 2, + "filter": "genre = adventure", + "attributesToHighlight": ["title"], + "attributesToRetrieve": ["title"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + allow_duplicates! 
{ + assert_json_snapshot!(response["hits"][0], + { "._rankingScore" => "[score]" }, + @r###" + { + "title": "Le Petit Prince", + "_formatted": { + "title": "…Petit Prince" + } + } + "###); + } + }, + ) + .await; +} + #[actix_rt::test] async fn formatted_contain_wildcard() { let server = Server::new_shared(); From 3a8051866afc97af32575806a40adf8c3b9638a0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 11:12:36 +0100 Subject: [PATCH 003/158] Use `return_keyword_results` function instead of returning raw keyword results when the embedder is broken --- crates/milli/src/search/hybrid.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index 8b274804c..90833dfe9 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -205,7 +205,11 @@ impl<'a> Search<'a> { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results( + self.limit, + self.offset, + keyword_results, + )); } } } From cd796b0f4b3323c74190cc862bcd95ab3abec318 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 11:46:00 +0100 Subject: [PATCH 004/158] Fix SDK test --- crates/milli/src/search/new/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 80e3ec7b2..c1fb18cfa 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -268,7 +268,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { last_match_last_token_position_plus_one } else { // we have matched the end of possible tokens, there's nothing to advance - tokens.len() - 1 + tokens.len() } }; From e0c3f3d560acd3d1cc67f09656c5594c47e1603f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 16:08:53 +0100 Subject: [PATCH 005/158] Fix #4984 --- crates/meilisearch/tests/search/mod.rs | 16 ++++------------ .../extract/extract_docid_word_positions.rs | 8 ++++---- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 1dc406fb3..f3c11e451 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -69,8 +69,9 @@ async fn search_with_stop_word() { let server = Server::new().await; let index = server.index("test"); - let (_, code) = - index.update_settings(json!({"stopWords": ["the", "a", "an", "to", "in", "of"]})).await; + let (_, code) = index + .update_settings(json!({"stopWords": ["the", "The", "a", "an", "to", "in", "of"]})) + .await; meili_snap::snapshot!(code, @"202 Accepted"); let documents = DOCUMENTS.clone(); @@ -81,16 +82,7 @@ async fn search_with_stop_word() { index .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { assert_eq!(code, 200, "{}", response); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "How to Train Your Dragon: The Hidden World", - "_formatted": { - "title": "How to Train Your Dragon: The Hidden World" - } - } - ] - "###); + snapshot!(json_string!(response["hits"]), @"[]"); }) .await; diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ba11ceeb3..16ea92fa4 
100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -57,9 +57,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let old_dictionary: Option> = settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let del_builder = + let mut del_builder = tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); - let del_tokenizer = del_builder.into_tokenizer(); + let del_tokenizer = del_builder.build(); let new_stop_words = settings_diff.new.stop_words.as_ref(); let new_separators: Option> = settings_diff @@ -69,9 +69,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let new_dictionary: Option> = settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let add_builder = + let mut add_builder = tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); - let add_tokenizer = add_builder.into_tokenizer(); + let add_tokenizer = add_builder.build(); // iterate over documents. let mut cursor = obkv_documents.into_cursor()?; From 8924d486dba1d48be642cd58830cbf7a2f46c515 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 3 Oct 2024 12:04:59 +0200 Subject: [PATCH 006/158] Add a test reproducing the bug --- .../tests/search/restrict_searchable.rs | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/crates/meilisearch/tests/search/restrict_searchable.rs b/crates/meilisearch/tests/search/restrict_searchable.rs index ca659c518..abd13fadf 100644 --- a/crates/meilisearch/tests/search/restrict_searchable.rs +++ b/crates/meilisearch/tests/search/restrict_searchable.rs @@ -367,3 +367,50 @@ async fn search_on_exact_field() { }) .await; } + +#[actix_rt::test] +async fn phrase_search_on_title() { + let server = Server::new().await; + let documents = json!([ + { "id": 8, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 5, "desc": "Document Review", "title": "Document Review Attorney" }, + { "id": 4, "desc": "Document Review", "title": "Document Review Manager - Cyber Incident Response (Remote)" }, + { "id": 3, "desc": "Document Review", "title": "Document Review Paralegal" }, + { "id": 2, "desc": "Document Review", "title": "Document Controller (Saudi National)" }, + { "id": 1, "desc": "Document Review", "title": "Document Reviewer" }, + { "id": 7, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 6, "desc": "Document Review", "title": "Document Review (Entry Level)" } + ]); + let index = index_with_documents(&server, &documents).await; + + index + .search( + json!({"q": "\"Document Review\"", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["title"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review Attorney" + }, + { + "title": "Document Review Manager - Cyber Incident Response (Remote)" + }, + { + "title": "Document Review Paralegal" + }, + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review (Entry Level)" + } + ] + "###); + }, + ) + .await; +} From 510ca999962e898c042c08682b5186ff3a3b1e71 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 12:28:03 +0100 Subject: [PATCH 007/158] Fixes #4974 --- 
crates/milli/src/search/new/resolve_query_graph.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/new/resolve_query_graph.rs b/crates/milli/src/search/new/resolve_query_graph.rs index 7a47b0a66..4496f8c65 100644 --- a/crates/milli/src/search/new/resolve_query_graph.rs +++ b/crates/milli/src/search/new/resolve_query_graph.rs @@ -193,15 +193,23 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - let mut candidates = RoaringBitmap::new(); + let mut candidates = None; for word in words.iter().flatten().copied() { if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? { - candidates |= word_docids; + if let Some(candidates) = candidates.as_mut() { + *candidates &= word_docids; + } else { + candidates = Some(word_docids); + } } else { return Ok(RoaringBitmap::new()); } } + let Some(mut candidates) = candidates else { + return Ok(RoaringBitmap::new()); + }; + let winsize = words.len().min(3); for win in words.windows(winsize) { From 25aac45fc7b1ceab292226c2d51a681adbd4b51f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Nov 2024 15:54:43 +0100 Subject: [PATCH 008/158] Expose better error messages --- crates/milli/src/error.rs | 4 ++++ crates/milli/src/update/new/channel.rs | 27 +++++++++++++++++++++- crates/milli/src/update/new/indexer/mod.rs | 26 +++++++++++++++++---- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 6c60dcecc..4da57a3e1 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -61,6 +61,10 @@ pub enum InternalError { Serialization(#[from] SerializationError), #[error(transparent)] Store(#[from] MdbError), + #[error("Cannot delete {key:?} from database {database_name}: {error}")] + StoreDeletion { database_name: &'static str, key: Vec, error: heed::Error }, + #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] + StorePut { database_name: &'static str, key: Vec, value_length: usize, error: heed::Error }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 3afcd3e4b..dda87d515 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -11,7 +11,7 @@ use super::extract::FacetKind; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; -use crate::index::IndexEmbeddingConfig; +use crate::index::{db_name, IndexEmbeddingConfig}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; use crate::{DocumentId, Index}; @@ -139,6 +139,27 @@ impl Database { Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), } } + + pub fn database_name(&self) -> &'static str { + match self { + Database::Main => db_name::MAIN, + Database::Documents => db_name::DOCUMENTS, + Database::ExternalDocumentsIds => db_name::EXTERNAL_DOCUMENTS_IDS, + Database::ExactWordDocids => db_name::EXACT_WORD_DOCIDS, + Database::WordDocids => db_name::WORD_DOCIDS, + Database::WordFidDocids => db_name::WORD_FIELD_ID_DOCIDS, + Database::WordPositionDocids => db_name::WORD_POSITION_DOCIDS, + Database::FidWordCountDocids => db_name::FIELD_ID_WORD_COUNT_DOCIDS, + 
Database::WordPairProximityDocids => db_name::WORD_PAIR_PROXIMITY_DOCIDS, + Database::FacetIdIsNullDocids => db_name::FACET_ID_IS_NULL_DOCIDS, + Database::FacetIdIsEmptyDocids => db_name::FACET_ID_IS_EMPTY_DOCIDS, + Database::FacetIdExistsDocids => db_name::FACET_ID_EXISTS_DOCIDS, + Database::FacetIdF64NumberDocids => db_name::FACET_ID_F64_DOCIDS, + Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, + Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, + Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, + } + } } impl From for Database { @@ -158,6 +179,10 @@ impl DbOperation { self.database.database(index) } + pub fn database_name(&self) -> &'static str { + self.database.database_name() + } + pub fn entry(self) -> EntryOperation { self.entry } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 01ac26503..0f533f5aa 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -41,7 +41,7 @@ use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; use crate::{ - FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, + Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder, UserError, }; @@ -356,13 +356,29 @@ where match operation { WriterOperation::DbOperation(db_operation) => { let database = db_operation.database(index); + let database_name = db_operation.database_name(); match db_operation.entry() { - EntryOperation::Delete(e) => { - if !database.delete(wtxn, e.entry())? { - unreachable!("We tried to delete an unknown key") + EntryOperation::Delete(e) => match database.delete(wtxn, e.entry()) { + Ok(false) => unreachable!("We tried to delete an unknown key"), + Ok(_) => (), + Err(error) => { + return Err(Error::InternalError(InternalError::StoreDeletion { + database_name, + key: e.entry().to_owned(), + error, + })); + } + }, + EntryOperation::Write(e) => { + if let Err(error) = database.put(wtxn, e.key(), e.value()) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: e.key().to_owned(), + value_length: e.value().len(), + error, + })); } } - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, } } WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { From a3103f347e3008247799506009eba19e6aa9171f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Nov 2024 16:05:31 +0100 Subject: [PATCH 009/158] Fix the facet f64 database name --- crates/milli/src/update/new/channel.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index dda87d515..00b471b52 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -112,7 +112,7 @@ pub enum Database { FacetIdIsNullDocids, FacetIdIsEmptyDocids, FacetIdExistsDocids, - FacetIdF64NumberDocids, + FacetIdF64Docids, FacetIdStringDocids, FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, @@ -133,7 +133,7 @@ impl Database { Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(), Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(), Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), - 
Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), + Database::FacetIdF64Docids => index.facet_id_f64_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), @@ -154,7 +154,7 @@ impl Database { Database::FacetIdIsNullDocids => db_name::FACET_ID_IS_NULL_DOCIDS, Database::FacetIdIsEmptyDocids => db_name::FACET_ID_IS_EMPTY_DOCIDS, Database::FacetIdExistsDocids => db_name::FACET_ID_EXISTS_DOCIDS, - Database::FacetIdF64NumberDocids => db_name::FACET_ID_F64_DOCIDS, + Database::FacetIdF64Docids => db_name::FACET_ID_F64_DOCIDS, Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, @@ -165,7 +165,7 @@ impl Database { impl From for Database { fn from(value: FacetKind) -> Self { match value { - FacetKind::Number => Database::FacetIdF64NumberDocids, + FacetKind::Number => Database::FacetIdF64Docids, FacetKind::String => Database::FacetIdStringDocids, FacetKind::Null => Database::FacetIdIsNullDocids, FacetKind::Empty => Database::FacetIdIsEmptyDocids, From 5606679c53a507b3778b957afe8d1b16e865a2d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Nov 2024 16:24:59 +0100 Subject: [PATCH 010/158] Use the obkv and grenad crates.io versions --- Cargo.lock | 9 +++++---- crates/meilisearch/Cargo.toml | 2 +- crates/milli/Cargo.toml | 7 ++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2de9007f5..d94ff0804 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2263,13 +2263,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" -version = "0.4.7" -source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#58ac87d852413571102f44c5e55ca13509a3f1a0" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e2ac9baf835ee2a7f0622a5617792ced6f65af25994078c343d429431ef2bbc" dependencies = [ "bytemuck", "byteorder", "either", - "rayon", "tempfile", ] @@ -3912,7 +3912,8 @@ dependencies = [ [[package]] name = "obkv" version = "0.3.0" -source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce535874008ecac554f02e0c670e6caf62134d6b" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae4512a8f418ac322335255a72361b9ac927e106f4d7fe6ab4d8ac59cb01f7a9" [[package]] name = "once_cell" diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index b11d90151..2884f0c9c 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -57,7 +57,7 @@ meilisearch-types = { path = "../meilisearch-types" } mimalloc = { version = "0.1.43", default-features = false } mime = "0.3.17" num_cpus = "1.16.0" -obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } +obkv = "0.3.0" once_cell = "1.19.0" ordered-float = "4.2.1" parking_lot = "0.12.3" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index c47a0a354..ccf6877cd 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -28,10 +28,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.4.7", 
default-features = false, features = [ - "rayon", # TODO Should we keep this feature - "tempfile", -], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } +grenad = { version = "0.5.0", default-features = false, features = ["tempfile"] } heed = { version = "0.20.3", default-features = false, features = [ "serde-json", "serde-bincode", @@ -42,7 +39,7 @@ json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memchr = "2.5.0" memmap2 = "0.9.4" -obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } +obkv = "0.3.0" once_cell = "1.19.0" ordered-float = "4.2.1" rayon = "1.10.0" From b4fb2dabd46f40c6ae8f320a77ba4d9342cfefbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Nov 2024 16:31:21 +0100 Subject: [PATCH 011/158] Use the grenad rayon feature --- Cargo.lock | 1 + crates/milli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index d94ff0804..0f2a13125 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2270,6 +2270,7 @@ dependencies = [ "bytemuck", "byteorder", "either", + "rayon", "tempfile", ] diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index ccf6877cd..1a3bfbcf1 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -28,7 +28,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.5.0", default-features = false, features = ["tempfile"] } +grenad = { version = "0.5.0", default-features = false, features = ["rayon", "tempfile"] } heed = { version = "0.20.3", default-features = false, features = [ "serde-json", "serde-bincode", From d66dc363ed5cf3e7c1a7a59c05bed1722338e0ac Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Nov 2024 15:57:56 +0100 Subject: [PATCH 012/158] Test and implement settings opt-out --- crates/dump/src/lib.rs | 2 + crates/dump/src/reader/compat/v5_to_v6.rs | 2 + .../after_adding_the_documents.snap | 3 +- .../after_adding_the_settings.snap | 3 +- .../after_removing_the_documents.snap | 3 +- .../registered_the_document_deletions.snap | 3 +- ...red_the_setting_and_document_addition.snap | 3 +- .../Intel to kefir succeeds.snap | 5 +- .../lib.rs/import_vectors/Intel to kefir.snap | 5 +- .../import_vectors/adding Intel succeeds.snap | 5 +- .../import_vectors/after adding Intel.snap | 3 +- ...ter_registering_settings_task_vectors.snap | 3 +- .../settings_update_processed_vectors.snap | 3 +- .../after_registering_settings_task.snap | 3 +- .../settings_update_processed.snap | 3 +- crates/meilisearch-types/src/error.rs | 2 + crates/meilisearch-types/src/settings.rs | 71 ++- .../src/routes/indexes/settings.rs | 26 + .../src/routes/indexes/settings_analytics.rs | 45 +- crates/meilisearch/tests/dumps/mod.rs | 56 ++- .../meilisearch/tests/search/facet_search.rs | 112 +++++ .../tests/settings/get_settings.rs | 8 +- crates/meilisearch/tests/settings/mod.rs | 1 + .../tests/settings/prefix_search_settings.rs | 458 ++++++++++++++++++ crates/milli/src/index.rs | 61 ++- crates/milli/src/search/new/mod.rs | 9 + .../src/search/new/query_term/parse_query.rs | 3 +- crates/milli/src/update/facet/mod.rs | 7 + .../extract/extract_facet_string_docids.rs | 31 +- .../extract/extract_fid_docid_facet_values.rs | 8 +- .../milli/src/update/index_documents/mod.rs | 35 +- .../src/update/index_documents/transform.rs | 25 +- crates/milli/src/update/new/indexer/mod.rs | 5 +- 
.../milli/src/update/new/word_fst_builder.rs | 6 +- crates/milli/src/update/settings.rs | 86 +++- crates/milli/src/update/words_prefixes_fst.rs | 8 +- 36 files changed, 1018 insertions(+), 94 deletions(-) create mode 100644 crates/meilisearch/tests/settings/prefix_search_settings.rs diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 8bed7f0d4..31cd3028e 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -292,6 +292,8 @@ pub(crate) mod test { embedders: Setting::NotSet, search_cutoff_ms: Setting::NotSet, localized_attributes: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: std::marker::PhantomData, }; settings.check() diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index 785542cce..6b2655bdf 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -382,6 +382,8 @@ impl From> for v6::Settings { embedders: v6::Setting::NotSet, localized_attributes: v6::Setting::NotSet, search_cutoff_ms: v6::Setting::NotSet, + facet_search: v6::Setting::NotSet, + prefix_search: v6::Setting::NotSet, _kind: std::marker::PhantomData, } } diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap index 8d175e388..bda90680f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, 
dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap index d1de7ec61..be79abf21 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap index 114df2852..492eae3dd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 
0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap index b2b368be4..43be57779 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, 
pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_document_ids: 1, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, status: enqueued, details: { original_filter: true, deleted_documents: None }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap index 9e1995fee..ca1866473 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] 
---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap index 11995b0bd..f581defa8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/lib.rs --- ### 
Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: 
Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap index 9c028d141..27522376f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs 
-snapshot_kind: text +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: 
NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap index 5c83f6cac..28504ffea 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ 
-1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, 
proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap index c8f174c74..288f2bc88 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] 
---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My 
super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap index f9e6df03e..ff63c0caf 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), 
searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap index 24d5fff27..77367f06b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), 
dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet 
})}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index 22900371e..e2668fcea 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap index dae9b38cd..7f08c0575 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true 
### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: 
NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 00f88b7b4..4b930bf8d 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -290,6 +290,8 @@ InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; +InvalidSettingsFacetSearch , InvalidRequest , BAD_REQUEST ; +InvalidSettingsPrefixSearch , InvalidRequest , BAD_REQUEST ; InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ; InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index e3803fa28..48481e364 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -8,7 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; -use milli::index::IndexEmbeddingConfig; +use milli::index::{IndexEmbeddingConfig, PrefixSearch}; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -202,6 +202,12 @@ pub struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] pub localized_attributes: Setting>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + pub facet_search: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + pub prefix_search: Setting, #[serde(skip)] #[deserr(skip)] @@ -266,6 +272,8 @@ impl Settings { embedders: Setting::Reset, search_cutoff_ms: Setting::Reset, localized_attributes: Setting::Reset, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData, } } @@ -290,6 +298,8 @@ impl Settings { embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind, } = self; @@ -312,6 +322,8 @@ impl Settings { embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind: PhantomData, } } @@ -360,6 +372,8 @@ impl Settings { embedders: self.embedders, search_cutoff_ms: self.search_cutoff_ms, localized_attributes: self.localized_attributes, + facet_search: self.facet_search, + prefix_search: self.prefix_search, _kind: PhantomData, } } @@ -433,6 +447,8 @@ impl Settings { Setting::Set(this) } }, + prefix_search: other.prefix_search.or(self.prefix_search), + facet_search: 
other.facet_search.or(self.facet_search), _kind: PhantomData, } } @@ -469,6 +485,8 @@ pub fn apply_settings_to_builder( embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind, } = settings; @@ -657,6 +675,20 @@ pub fn apply_settings_to_builder( Setting::Reset => builder.reset_search_cutoff(), Setting::NotSet => (), } + + match prefix_search { + Setting::Set(prefix_search) => { + builder.set_prefix_search(PrefixSearch::from(*prefix_search)) + } + Setting::Reset => builder.reset_prefix_search(), + Setting::NotSet => (), + } + + match facet_search { + Setting::Set(facet_search) => builder.set_facet_search(*facet_search), + Setting::Reset => builder.reset_facet_search(), + Setting::NotSet => (), + } } pub enum SecretPolicy { @@ -755,6 +787,10 @@ pub fn settings( let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; + let prefix_search = index.prefix_search(rtxn)?.map(PrefixSearchSettings::from); + + let facet_search = index.facet_search(rtxn)?; + let mut settings = Settings { displayed_attributes: match displayed_attributes { Some(attrs) => Setting::Set(attrs), @@ -791,13 +827,14 @@ pub fn settings( Some(rules) => Setting::Set(rules.into_iter().map(|r| r.into()).collect()), None => Setting::Reset, }, + prefix_search: Setting::Set(prefix_search.unwrap_or_default()), + facet_search: Setting::Set(facet_search), _kind: PhantomData, }; if let SecretPolicy::HideSecrets = secret_policy { settings.hide_secrets() } - Ok(settings) } @@ -964,6 +1001,32 @@ impl std::ops::Deref for WildcardSetting { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +pub enum PrefixSearchSettings { + #[default] + IndexingTime, + Disabled, +} + +impl From for PrefixSearchSettings { + fn from(value: PrefixSearch) -> Self { + match value { + PrefixSearch::IndexingTime => PrefixSearchSettings::IndexingTime, + PrefixSearch::Disabled => PrefixSearchSettings::Disabled, + } + } +} +impl From for PrefixSearch { + fn from(value: PrefixSearchSettings) -> Self { + match value { + PrefixSearchSettings::IndexingTime => PrefixSearch::IndexingTime, + PrefixSearchSettings::Disabled => PrefixSearch::Disabled, + } + } +} + #[cfg(test)] pub(crate) mod test { use super::*; @@ -990,6 +1053,8 @@ pub(crate) mod test { embedders: Setting::NotSet, localized_attributes: Setting::NotSet, search_cutoff_ms: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData::, }; @@ -1019,6 +1084,8 @@ pub(crate) mod test { embedders: Setting::NotSet, localized_attributes: Setting::NotSet, search_cutoff_ms: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData::, }; diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index a9d8d3053..e1794535b 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -369,6 +369,30 @@ make_setting_route!( SearchCutoffMsAnalytics ); +make_setting_route!( + "/facet-search", + put, + bool, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFacetSearch, + >, + facet_search, + "facetSearch", + FacetSearchAnalytics +); + +make_setting_route!( + "/prefix-search", + put, + 
meilisearch_types::settings::PrefixSearchSettings, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPrefixSearch, + >, + prefix_search, + "prefixSearch", + PrefixSearchAnalytics +); + macro_rules! generate_configure { ($($mod:ident),*) => { pub fn configure(cfg: &mut web::ServiceConfig) { @@ -456,6 +480,8 @@ pub async fn update_all( non_separator_tokens: NonSeparatorTokensAnalytics::new( new_settings.non_separator_tokens.as_ref().set(), ), + facet_search: FacetSearchAnalytics::new(new_settings.facet_search.as_ref().set()), + prefix_search: PrefixSearchAnalytics::new(new_settings.prefix_search.as_ref().set()), }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs index 32bddcbdd..ddca2c00a 100644 --- a/crates/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -10,7 +10,8 @@ use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; use meilisearch_types::settings::{ - FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings, + FacetingSettings, PaginationSettings, PrefixSearchSettings, ProximityPrecisionView, + RankingRuleView, TypoSettings, }; use serde::Serialize; @@ -36,6 +37,8 @@ pub struct SettingsAnalytics { pub dictionary: DictionaryAnalytics, pub separator_tokens: SeparatorTokensAnalytics, pub non_separator_tokens: NonSeparatorTokensAnalytics, + pub facet_search: FacetSearchAnalytics, + pub prefix_search: PrefixSearchAnalytics, } impl Aggregate for SettingsAnalytics { @@ -183,6 +186,14 @@ impl Aggregate for SettingsAnalytics { non_separator_tokens: NonSeparatorTokensAnalytics { total: new.non_separator_tokens.total.or(self.non_separator_tokens.total), }, + facet_search: FacetSearchAnalytics { + set: new.facet_search.set | self.facet_search.set, + value: new.facet_search.value.or(self.facet_search.value), + }, + prefix_search: PrefixSearchAnalytics { + set: new.prefix_search.set | self.prefix_search.set, + value: new.prefix_search.value.or(self.prefix_search.value), + }, }) } @@ -620,3 +631,35 @@ impl NonSeparatorTokensAnalytics { SettingsAnalytics { non_separator_tokens: self, ..Default::default() } } } + +#[derive(Serialize, Default)] +pub struct FacetSearchAnalytics { + pub set: bool, + pub value: Option, +} + +impl FacetSearchAnalytics { + pub fn new(settings: Option<&bool>) -> Self { + Self { set: settings.is_some(), value: settings.copied() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { facet_search: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct PrefixSearchAnalytics { + pub set: bool, + pub value: Option, +} + +impl PrefixSearchAnalytics { + pub fn new(settings: Option<&PrefixSearchSettings>) -> Self { + Self { set: settings.is_some(), value: settings.cloned() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { prefix_search: self, ..Default::default() } + } +} diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index c7d157b00..dbbd1abf0 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -79,7 +79,9 @@ async fn import_dump_v1_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + 
"localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -242,7 +244,9 @@ async fn import_dump_v1_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -391,7 +395,9 @@ async fn import_dump_v1_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -526,7 +532,9 @@ async fn import_dump_v2_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -673,7 +681,9 @@ async fn import_dump_v2_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -819,7 +829,9 @@ async fn import_dump_v2_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -954,7 +966,9 @@ async fn import_dump_v3_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1101,7 +1115,9 @@ async fn import_dump_v3_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1247,7 +1263,9 @@ async fn import_dump_v3_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1382,7 +1400,9 @@ async fn import_dump_v4_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1529,7 +1549,9 @@ async fn import_dump_v4_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1675,7 +1697,9 @@ async fn import_dump_v4_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1922,7 +1946,9 @@ async fn import_dump_v6_containing_experimental_features() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); @@ -2102,7 +2128,9 @@ async fn generate_and_import_dump_containing_vectors() { } }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 12d2226a9..52b8171c4 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -200,3 +200,115 @@ async fn 
simple_facet_search_with_sort_by_count() { assert_eq!(hits[0], json!({ "value": "Action", "count": 3 })); assert_eq!(hits[1], json!({ "value": "Adventure", "count": 2 })); } + +#[actix_rt::test] +async fn add_documents_and_deactivate_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(1).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 0); +} + +#[actix_rt::test] +async fn deactivate_facet_search_and_add_documents() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 0); +} + +#[actix_rt::test] +async fn deactivate_facet_search_add_documents_and_activate_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + let (response, code) = index + .update_settings(json!({ + "facetSearch": true, + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); +} + +#[actix_rt::test] +async fn deactivate_facet_search_add_documents_and_reset_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + let (response, code) = index + .update_settings(json!({ + "facetSearch": serde_json::Value::Null, + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); +} diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 6de0db0b3..1b1964680 
100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -56,7 +56,7 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 17); + assert_eq!(settings.keys().len(), 19); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); @@ -87,6 +87,8 @@ async fn get_settings() { ); assert_eq!(settings["proximityPrecision"], json!("byWord")); assert_eq!(settings["searchCutoffMs"], json!(null)); + assert_eq!(settings["prefixSearch"], json!("indexingTime")); + assert_eq!(settings["facetSearch"], json!(true)); } #[actix_rt::test] @@ -199,7 +201,9 @@ async fn secrets_are_hidden_in_settings() { } }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); diff --git a/crates/meilisearch/tests/settings/mod.rs b/crates/meilisearch/tests/settings/mod.rs index ccb4139e6..67df4068a 100644 --- a/crates/meilisearch/tests/settings/mod.rs +++ b/crates/meilisearch/tests/settings/mod.rs @@ -1,5 +1,6 @@ mod distinct; mod errors; mod get_settings; +mod prefix_search_settings; mod proximity_settings; mod tokenizer_customization; diff --git a/crates/meilisearch/tests/settings/prefix_search_settings.rs b/crates/meilisearch/tests/settings/prefix_search_settings.rs new file mode 100644 index 000000000..34a891f97 --- /dev/null +++ b/crates/meilisearch/tests/settings/prefix_search_settings.rs @@ -0,0 +1,458 @@ +use meili_snap::{json_string, snapshot}; +use once_cell::sync::Lazy; + +use crate::common::Server; +use crate::json; + +static DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + }, + ]) +}); + +#[actix_rt::test] +async fn add_docs_and_disable() { + let server = Server::new().await; + let index = server.index("test"); + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(0).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(1).await; + + // only 1 document should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; + + // only 1 document should match + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_and_add_docs() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + 
.update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(1).await; + + // only 1 document should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_add_docs_and_enable() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(1).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "indexingTime", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_add_docs_and_reset() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", 
response); + index.wait_task(0).await; + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(1).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": serde_json::Value::Null, + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn default_behavior() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(1).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": 
"the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 89f965b7c..5bd24b9e4 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -70,6 +70,8 @@ pub mod main_key { pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; pub const SEARCH_CUTOFF: &str = "search_cutoff"; pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; + pub const FACET_SEARCH: &str = "facet_search"; + pub const PREFIX_SEARCH: &str = "prefix_search"; } pub mod db_name { @@ -1233,6 +1235,10 @@ impl Index { ) } + pub(crate) fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY) + } + /// Returns the FST which is the words prefixes dictionary of the engine. pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn<'t>) -> Result>> { match self.main.remap_types::().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { @@ -1562,6 +1568,41 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::PROXIMITY_PRECISION) } + pub fn prefix_search(&self, txn: &RoTxn<'_>) -> heed::Result> { + self.main.remap_types::>().get(txn, main_key::PREFIX_SEARCH) + } + + pub(crate) fn put_prefix_search( + &self, + txn: &mut RwTxn<'_>, + val: PrefixSearch, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + txn, + main_key::PREFIX_SEARCH, + &val, + ) + } + + pub(crate) fn delete_prefix_search(&self, txn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::PREFIX_SEARCH) + } + + pub fn facet_search(&self, txn: &RoTxn<'_>) -> heed::Result { + self.main + .remap_types::>() + .get(txn, main_key::FACET_SEARCH) + .map(|v| v.unwrap_or(true)) + } + + pub(crate) fn put_facet_search(&self, txn: &mut RwTxn<'_>, val: bool) -> heed::Result<()> { + self.main.remap_types::>().put(txn, main_key::FACET_SEARCH, &val) + } + + pub(crate) fn delete_facet_search(&self, txn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::FACET_SEARCH) + } + pub fn localized_attributes_rules( &self, rtxn: &RoTxn<'_>, @@ -1647,10 +1688,14 @@ impl Index { Ok(res) } - pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result { + pub fn prefix_settings(&self, rtxn: &RoTxn<'_>) -> Result { + let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); Ok(PrefixSettings { - compute_prefixes: true, + compute_prefixes, max_prefix_length: 4, + #[cfg(not(test))] + prefix_count_threshold: 100, + #[cfg(test)] prefix_count_threshold: 100, }) } @@ -1665,9 +1710,17 @@ pub struct IndexEmbeddingConfig { #[derive(Debug, Deserialize, Serialize)] pub struct PrefixSettings { - pub prefix_count_threshold: u64, + pub prefix_count_threshold: usize, pub max_prefix_length: usize, - pub compute_prefixes: bool, + pub compute_prefixes: PrefixSearch, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub enum PrefixSearch { + #[default] + IndexingTime, + Disabled, } #[derive(Serialize, Deserialize)] diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index f7c590360..4edcd09de 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; +use crate::index::PrefixSearch; use 
crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; @@ -68,6 +69,7 @@ pub struct SearchContext<'ctx> { pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, pub restricted_fids: Option, + pub prefix_search: PrefixSearch, } impl<'ctx> SearchContext<'ctx> { @@ -85,6 +87,8 @@ impl<'ctx> SearchContext<'ctx> { } } + let prefix_search = index.prefix_search(txn)?.unwrap_or_default(); + Ok(Self { index, txn, @@ -94,9 +98,14 @@ impl<'ctx> SearchContext<'ctx> { term_interner: <_>::default(), phrase_docids: <_>::default(), restricted_fids: None, + prefix_search, }) } + pub fn is_prefix_search_allowed(&self) -> bool { + self.prefix_search != PrefixSearch::Disabled + } + pub fn attributes_to_search_on( &mut self, attributes_to_search_on: &'ctx [String], diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs index bb98f19ce..a76fd6525 100644 --- a/crates/milli/src/search/new/query_term/parse_query.rs +++ b/crates/milli/src/search/new/query_term/parse_query.rs @@ -28,6 +28,7 @@ pub fn located_query_terms_from_tokens( words_limit: Option, ) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; + let allow_prefix_search = ctx.is_prefix_search_allowed(); let mut query_terms = Vec::new(); @@ -94,7 +95,7 @@ pub fn located_query_terms_from_tokens( ctx, word, nbr_typos(word), - true, + allow_prefix_search, false, )?; let located_term = LocatedQueryTerm { diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index 2e592519b..f4835e6a8 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -173,6 +173,13 @@ impl<'i> FacetsUpdate<'i> { } match self.normalized_delta_data { + _ if !self.index.facet_search(wtxn)? => { + // If facet search is disabled, we don't need to compute facet search databases. + // We clear the facet search databases. 
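+            // These two databases only back the facet-search feature, so dropping them here should be
+            // safe: the facet strings are re-extracted and the databases rebuilt if facet search is re-enabled.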
+ self.index.facet_id_string_fst.clear(wtxn)?; + self.index.facet_id_normalized_string_strings.clear(wtxn)?; + return Ok(()); + } Some(data) => index_facet_search(wtxn, data, self.index), None => Ok(()), } diff --git a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index e0d7e1386..d330ea5a0 100644 --- a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -34,10 +34,12 @@ pub fn extract_facet_string_docids( extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) } else { let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; + let facet_search = settings_diff.new.facet_search; extract_facet_string_docids_document_update( docid_fid_facet_string, indexer, localized_field_ids, + facet_search, ) } } @@ -51,6 +53,7 @@ fn extract_facet_string_docids_document_update( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, localized_field_ids: &LocalizedFieldIds, + facet_search: bool, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -96,7 +99,7 @@ fn extract_facet_string_docids_document_update( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - { + if facet_search { let locales = localized_field_ids.locales(field_id); let hyper_normalized_value = normalize_facet_string(normalized_value, locales); @@ -179,8 +182,10 @@ fn extract_facet_string_docids_settings( let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); let are_same_locales = old_locales == new_locales; + let reindex_facet_search = + settings_diff.new.facet_search && !settings_diff.old.facet_search; - if is_same_value && are_same_locales { + if is_same_value && are_same_locales && !reindex_facet_search { continue; } @@ -191,18 +196,26 @@ fn extract_facet_string_docids_settings( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - { - let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); - let new_hyper_normalized_value = if are_same_locales { - &old_hyper_normalized_value + if settings_diff.new.facet_search { + let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); + let old_hyper_normalized_value; + let old_hyper_normalized_value = if !settings_diff.old.facet_search + || deladd_reader.get(DelAdd::Deletion).is_none() + { + // if the facet search is disabled in the old settings or if no facet string is deleted, + // we don't need to normalize the facet string. + None + } else if are_same_locales { + Some(&new_hyper_normalized_value) } else { - &normalize_facet_string(normalized_value, new_locales) + old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); + Some(&old_hyper_normalized_value) }; let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { + if old_hyper_normalized_value == Some(&new_hyper_normalized_value) { // nothing to do if we delete and re-add the value. 
if is_same_value { continue; @@ -222,7 +235,7 @@ fn extract_facet_string_docids_settings( } else { // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different. // deletion - if deladd_reader.get(DelAdd::Deletion).is_some() { + if let Some(old_hyper_normalized_value) = old_hyper_normalized_value { // insert old value let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; buffer.clear(); diff --git a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 047669521..88c02fe70 100644 --- a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -80,7 +80,7 @@ pub fn extract_fid_docid_facet_values( let new_faceted_fids: BTreeSet<_> = settings_diff.new.faceted_fields_ids.iter().copied().collect(); - if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { + if !settings_diff.settings_update_only || settings_diff.reindex_facets() { let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::from_slice(value); @@ -112,8 +112,10 @@ pub fn extract_fid_docid_facet_values( (field_id, None, add_value) } EitherOrBoth::Both(&field_id, _) => { - // during settings update, recompute the changing settings only. - if settings_diff.settings_update_only { + // during settings update, recompute the changing settings only unless a global change is detected. + if settings_diff.settings_update_only + && !settings_diff.global_facet_settings_changed() + { continue; } diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index baecbdcf0..186cc501d 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -29,6 +29,7 @@ pub use self::transform::{Transform, TransformOutput}; use super::new::StdResult; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; +use crate::index::{PrefixSearch, PrefixSettings}; use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ @@ -82,8 +83,6 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub words_prefix_threshold: Option, - pub max_prefix_length: Option, pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, @@ -565,14 +564,32 @@ where self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; // Run the words prefixes update operation. - let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); - if let Some(value) = self.config.words_prefix_threshold { - builder.threshold(value); + let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = + self.index.prefix_settings(self.wtxn)?; + + // If the prefix search is enabled at indexing time, we compute the prefixes. 
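+        // Note that the threshold and maximum prefix length are now read from the index's PrefixSettings
+        // rather than from IndexDocumentsConfig, whose words_prefix_threshold/max_prefix_length fields were removed above.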
+ if compute_prefixes == PrefixSearch::IndexingTime { + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); + builder.threshold(prefix_count_threshold); + builder.max_prefix_length(max_prefix_length); + builder.execute()?; + } else { + // If the prefix search is disabled at indexing time, we delete the previous words prefixes fst. + // And all the associated docids databases. + self.index.delete_words_prefixes_fst(self.wtxn)?; + self.index.word_prefix_docids.clear(self.wtxn)?; + self.index.exact_word_prefix_docids.clear(self.wtxn)?; + self.index.word_prefix_position_docids.clear(self.wtxn)?; + self.index.word_prefix_fid_docids.clear(self.wtxn)?; + + databases_seen += 3; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + return Ok(()); } - if let Some(value) = self.config.max_prefix_length { - builder.max_prefix_length(value); - } - builder.execute()?; if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index 38bf90435..7477b5667 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -667,14 +667,23 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only a faceted field has been added, keep only this field. - let must_reindex_facets = settings_diff.reindex_facets(); - let necessary_faceted_field = |id: FieldId| -> bool { - let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); - must_reindex_facets - && modified_faceted_fields - .iter() - .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) - }; + let global_facet_settings_changed = settings_diff.global_facet_settings_changed(); + let facet_fids_changed = settings_diff.facet_fids_changed(); + let necessary_faceted_field = + |id: FieldId| -> bool { + let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); + if global_facet_settings_changed { + settings_diff.new.user_defined_faceted_fields.iter().any(|long| { + is_faceted_by(long, field_name) || is_faceted_by(field_name, long) + }) + } else if facet_fids_changed { + modified_faceted_fields.iter().any(|long| { + is_faceted_by(long, field_name) || is_faceted_by(field_name, long) + }) + } else { + false + } + }; // Alway provide all fields when vectors are involved because // we need the fields for the prompt/templating. diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0f533f5aa..f1f5d96d0 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -445,7 +445,10 @@ where (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + if index.facet_search(wtxn)? 
{ + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + } + compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords)); diff --git a/crates/milli/src/update/new/word_fst_builder.rs b/crates/milli/src/update/new/word_fst_builder.rs index 2b1c4604b..6bc72d91d 100644 --- a/crates/milli/src/update/new/word_fst_builder.rs +++ b/crates/milli/src/update/new/word_fst_builder.rs @@ -80,12 +80,12 @@ pub struct PrefixDelta { } struct PrefixFstBuilder { - prefix_count_threshold: u64, + prefix_count_threshold: usize, max_prefix_length: usize, /// TODO: Replace the full memory allocation prefix_fst_builders: Vec>>, current_prefix: Vec, - current_prefix_count: Vec, + current_prefix_count: Vec, modified_prefixes: HashSet, current_prefix_is_modified: Vec, } @@ -95,7 +95,7 @@ impl PrefixFstBuilder { let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = prefix_settings; - if !compute_prefixes { + if compute_prefixes != crate::index::PrefixSearch::IndexingTime { return None; } diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index ccfdb1711..3d2702479 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -17,7 +17,8 @@ use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{ - IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, + IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, + DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; use crate::prompt::default_max_bytes; @@ -177,6 +178,8 @@ pub struct Settings<'a, 't, 'i> { embedder_settings: Setting>>, search_cutoff: Setting, localized_attributes_rules: Setting>, + prefix_search: Setting, + facet_search: Setting, } impl<'a, 't, 'i> Settings<'a, 't, 'i> { @@ -212,6 +215,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { embedder_settings: Setting::NotSet, search_cutoff: Setting::NotSet, localized_attributes_rules: Setting::NotSet, + prefix_search: Setting::NotSet, + facet_search: Setting::NotSet, indexer_config, } } @@ -418,6 +423,22 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.localized_attributes_rules = Setting::Reset; } + pub fn set_prefix_search(&mut self, value: PrefixSearch) { + self.prefix_search = Setting::Set(value); + } + + pub fn reset_prefix_search(&mut self) { + self.prefix_search = Setting::Reset; + } + + pub fn set_facet_search(&mut self, value: bool) { + self.facet_search = Setting::Set(value); + } + + pub fn reset_facet_search(&mut self) { + self.facet_search = Setting::Reset; + } + #[tracing::instrument( level = "trace" skip(self, progress_callback, should_abort, settings_diff), @@ -944,7 +965,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { false } else { self.index.put_proximity_precision(self.wtxn, new)?; - true + old.is_some() || new != ProximityPrecision::default() } } Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?, @@ -954,6 +975,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } + fn update_prefix_search(&mut self) -> Result { + let changed = match self.prefix_search { + Setting::Set(new) => { + let old = self.index.prefix_search(self.wtxn)?; + if old == Some(new) { + false + } else { + self.index.put_prefix_search(self.wtxn, new)?; + old.is_some() || new != PrefixSearch::default() + } + } + Setting::Reset => self.index.delete_prefix_search(self.wtxn)?, + Setting::NotSet => false, + 
}; + + Ok(changed) + } + + fn update_facet_search(&mut self) -> Result { + let changed = match self.facet_search { + Setting::Set(new) => { + let old = self.index.facet_search(self.wtxn)?; + if old == new { + false + } else { + self.index.put_facet_search(self.wtxn, new)?; + true + } + } + Setting::Reset => self.index.delete_facet_search(self.wtxn)?, + Setting::NotSet => false, + }; + + Ok(changed) + } + fn update_embedding_configs(&mut self) -> Result> { match std::mem::take(&mut self.embedder_settings) { Setting::Set(configs) => self.update_embedding_configs_set(configs), @@ -1203,6 +1260,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; + self.update_prefix_search()?; + self.update_facet_search()?; self.update_localized_attributes_rules()?; let embedding_config_updates = self.update_embedding_configs()?; @@ -1282,6 +1341,7 @@ impl InnerIndexSettingsDiff { || old_settings.allowed_separators != new_settings.allowed_separators || old_settings.dictionary != new_settings.dictionary || old_settings.proximity_precision != new_settings.proximity_precision + || old_settings.prefix_search != new_settings.prefix_search || old_settings.localized_searchable_fields_ids != new_settings.localized_searchable_fields_ids }; @@ -1372,7 +1432,7 @@ impl InnerIndexSettingsDiff { } } - pub fn reindex_facets(&self) -> bool { + pub fn facet_fids_changed(&self) -> bool { let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { return true; @@ -1392,7 +1452,15 @@ impl InnerIndexSettingsDiff { } (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) - || self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + } + + pub fn global_facet_settings_changed(&self) -> bool { + self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + || self.old.facet_search != self.new.facet_search + } + + pub fn reindex_facets(&self) -> bool { + self.facet_fids_changed() || self.global_facet_settings_changed() } pub fn reindex_vectors(&self) -> bool { @@ -1432,6 +1500,8 @@ pub(crate) struct InnerIndexSettings { pub non_faceted_fields_ids: Vec, pub localized_searchable_fields_ids: LocalizedFieldIds, pub localized_faceted_fields_ids: LocalizedFieldIds, + pub prefix_search: PrefixSearch, + pub facet_search: bool, } impl InnerIndexSettings { @@ -1457,6 +1527,8 @@ impl InnerIndexSettings { Some(embedding_configs) => embedding_configs, None => embedders(index.embedding_configs(rtxn)?)?, }; + let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); + let facet_search = index.facet_search(rtxn)?; let existing_fields: HashSet<_> = index .field_distribution(rtxn)? 
.into_iter() @@ -1514,6 +1586,8 @@ impl InnerIndexSettings { non_faceted_fields_ids: vectors_fids.clone(), localized_searchable_fields_ids, localized_faceted_fields_ids, + prefix_search, + facet_search, }) } @@ -2721,6 +2795,8 @@ mod tests { embedder_settings, search_cutoff, localized_attributes_rules, + prefix_search, + facet_search, } = settings; assert!(matches!(searchable_fields, Setting::NotSet)); assert!(matches!(displayed_fields, Setting::NotSet)); @@ -2746,6 +2822,8 @@ mod tests { assert!(matches!(embedder_settings, Setting::NotSet)); assert!(matches!(search_cutoff, Setting::NotSet)); assert!(matches!(localized_attributes_rules, Setting::NotSet)); + assert!(matches!(prefix_search, Setting::NotSet)); + assert!(matches!(facet_search, Setting::NotSet)); }) .unwrap(); } diff --git a/crates/milli/src/update/words_prefixes_fst.rs b/crates/milli/src/update/words_prefixes_fst.rs index d47d6d14c..d18bfa74c 100644 --- a/crates/milli/src/update/words_prefixes_fst.rs +++ b/crates/milli/src/update/words_prefixes_fst.rs @@ -9,7 +9,7 @@ use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'i> { wtxn: &'t mut RwTxn<'i>, index: &'i Index, - threshold: u32, + threshold: usize, max_prefix_length: usize, } @@ -24,8 +24,8 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { /// /// Default value is 100. This value must be higher than 50 and will be clamped /// to this bound otherwise. - pub fn threshold(&mut self, value: u32) -> &mut Self { - self.threshold = value.max(50); + pub fn threshold(&mut self, value: usize) -> &mut Self { + self.threshold = value; self } @@ -34,7 +34,7 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped /// to these bounds, otherwise. pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.clamp(1, 25); + self.max_prefix_length = value; self } From aa460819a75f3184cb363fb24b915ff946b4171e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 25 Nov 2024 16:09:15 +0100 Subject: [PATCH 013/158] Add more precise spans --- crates/milli/src/update/new/indexer/mod.rs | 373 ++++++++++++--------- 1 file changed, 207 insertions(+), 166 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0f533f5aa..e285ca9cb 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -109,55 +109,71 @@ where let rtxn = index.read_txn()?; + // document but we need to create a function that collects and compresses documents. let document_sender = extractor_sender.documents(); let document_extractor = DocumentsExtractor::new(&document_sender, embedders); let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - - extract(document_changes, - &document_extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingDocuments, - )?; - - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
- *current = current.saturating_add_signed(delta); - } - document_extractor_data.docids_delta.apply_to(document_ids); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + let _entered = span.enter(); + extract(document_changes, + &document_extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingDocuments, + )?; } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); + let _entered = span.enter(); + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. + *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids); + } - field_distribution.retain(|_, v| *v != 0); + field_distribution.retain(|_, v| *v != 0); + } let facet_field_ids_delta; { - let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); - let _entered = span.enter(); + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); + let _entered = span.enter(); - facet_field_ids_delta = merge_and_send_facet_docids( FacetedDocidsExtractor::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - &extractor_sender.field_id_docid_facet_sender(), - Step::ExtractingFacets - )?, - FacetDatabases::new(index), - index, - extractor_sender.facet_docids(), - )?; + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + &extractor_sender.field_id_docid_facet_sender(), + Step::ExtractingFacets + )? + }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); + let _entered = span.enter(); + + facet_field_ids_delta = merge_and_send_facet_docids( + caches, + FacetDatabases::new(index), + index, + extractor_sender.facet_docids(), + )?; + } } { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); + + let WordDocidsCaches { @@ -166,15 +182,19 @@ where exact_word_docids, word_position_docids, fid_word_count_docids, - } = WordDocidsExtractors::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWords - )?; + } = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); + + WordDocidsExtractors::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + Step::ExtractingWords + )? 
+ }; - // TODO Word Docids Merger { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); let _entered = span.enter(); @@ -187,7 +207,6 @@ where )?; } - // Word Fid Docids Merging { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); let _entered = span.enter(); @@ -200,7 +219,6 @@ where )?; } - // Exact Word Docids Merging { let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); let _entered = span.enter(); @@ -213,7 +231,6 @@ where )?; } - // Word Position Docids Merging { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); let _entered = span.enter(); @@ -226,7 +243,6 @@ where )?; } - // Fid Word Count Docids Merging { let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); let _entered = span.enter(); @@ -244,30 +260,34 @@ where // this works only if the settings didn't change during this transaction. let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); if proximity_precision == ProximityPrecision::ByWord { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + ::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + Step::ExtractingWordProximity, + )? + }; - let caches = ::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWordProximity, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); + let _entered = span.enter(); - merge_and_send_docids( - caches, - index.word_pair_proximity_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } } 'vectors: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); let mut index_embeddings = index.embedding_configs(&rtxn)?; if index_embeddings.is_empty() { @@ -277,13 +297,22 @@ where let embedding_sender = extractor_sender.embeddings(); let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); - for config in &mut index_embeddings { - 'data: for data in datastore.iter_mut() { - let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); + extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); + let _entered = span.enter(); + + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut 
data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { continue 'data; }; + deladd.apply_to(&mut config.user_provided); + } } } @@ -291,21 +320,24 @@ where } 'geo: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); - let _entered = span.enter(); - let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { break 'geo; }; let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - extract( - document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::WritingGeoPoints - )?; + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); + let _entered = span.enter(); + + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::WritingGeoPoints + )?; + } merge_and_send_rtree( datastore, @@ -316,11 +348,7 @@ where )?; } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); - } + (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); Result::Ok(facet_field_ids_delta) })?; @@ -352,90 +380,103 @@ where .collect(); let mut arroy_writers = arroy_writers?; - for operation in writer_receiver { - match operation { - WriterOperation::DbOperation(db_operation) => { - let database = db_operation.database(index); - let database_name = db_operation.database_name(); - match db_operation.entry() { - EntryOperation::Delete(e) => match database.delete(wtxn, e.entry()) { - Ok(false) => unreachable!("We tried to delete an unknown key"), - Ok(_) => (), - Err(error) => { - return Err(Error::InternalError(InternalError::StoreDeletion { - database_name, - key: e.entry().to_owned(), - error, - })); - } - }, - EntryOperation::Write(e) => { - if let Err(error) = database.put(wtxn, e.key(), e.value()) { - return Err(Error::InternalError(InternalError::StorePut { - database_name, - key: e.key().to_owned(), - value_length: e.value().len(), - error, - })); + { + let span = tracing::trace_span!(target: "indexing::write_db", "all"); + let _entered = span.enter(); + + for operation in writer_receiver { + match operation { + WriterOperation::DbOperation(db_operation) => { + let database = db_operation.database(index); + let database_name = db_operation.database_name(); + match db_operation.entry() { + EntryOperation::Delete(e) => match database.delete(wtxn, e.entry()) { + Ok(false) => unreachable!("We tried to delete an unknown key"), + Ok(_) => (), + Err(error) => { + return Err(Error::InternalError( + InternalError::StoreDeletion { + database_name, + key: e.entry().to_owned(), + error, + }, + )); + } + }, + EntryOperation::Write(e) => { + if let Err(error) = database.put(wtxn, e.key(), e.value()) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: e.key().to_owned(), + value_length: e.value().len(), + error, + })); + } } } } + WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { + ArroyOperation::DeleteVectors { docid } => { + for ( + _embedder_index, + (_embedder_name, _embedder, writer, dimensions), + ) in &mut arroy_writers + { + let dimensions = *dimensions; + writer.del_items(wtxn, dimensions, docid)?; + } + } + ArroyOperation::SetVectors { + docid, + embedder_id, + embeddings: raw_embeddings, + } => { + let (_, _, writer, dimensions) = arroy_writers + .get(&embedder_id) + .expect("requested a 
missing embedder"); + + let mut embeddings = Embeddings::new(*dimensions); + for embedding in raw_embeddings { + embeddings.append(embedding).unwrap(); + } + + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + ArroyOperation::SetVector { docid, embedder_id, embedding } => { + let (_, _, writer, dimensions) = arroy_writers + .get(&embedder_id) + .expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, &embedding)?; + } + ArroyOperation::Finish { configs } => { + let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); + let _entered = span.enter(); + + (indexing_context.send_progress)(Progress::from_step( + Step::WritingEmbeddingsToDatabase, + )); + + for ( + _embedder_index, + (_embedder_name, _embedder, writer, dimensions), + ) in &mut arroy_writers + { + let dimensions = *dimensions; + writer.build_and_quantize( + wtxn, + &mut rng, + dimensions, + false, + &indexing_context.must_stop_processing, + )?; + } + + index.put_embedding_configs(wtxn, configs)?; + } + }, } - WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { - ArroyOperation::DeleteVectors { docid } => { - for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - writer.del_items(wtxn, dimensions, docid)?; - } - } - ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings: raw_embeddings, - } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - // TODO: switch to Embeddings - let mut embeddings = Embeddings::new(*dimensions); - for embedding in raw_embeddings { - embeddings.append(embedding).unwrap(); - } - - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; - } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - ArroyOperation::Finish { configs } => { - let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); - let _entered = span.enter(); - - (indexing_context.send_progress)(Progress::from_step( - Step::WritingEmbeddingsToDatabase, - )); - - for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - writer.build_and_quantize( - wtxn, - &mut rng, - dimensions, - false, - &indexing_context.must_stop_processing, - )?; - } - - index.put_embedding_configs(wtxn, configs)?; - } - }, } } From fa15be5bc46e0f2543860fcd704f13649d522c5b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 25 Nov 2024 16:28:57 +0100 Subject: [PATCH 014/158] Add span around commit --- crates/index-scheduler/src/batch.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 630471790..04cdb912f 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1024,7 +1024,13 @@ impl IndexScheduler { let mut index_wtxn = index.write_txn()?; let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?; - index_wtxn.commit()?; + + { + let span = tracing::trace_span!(target: "indexing::scheduler", "commit"); + let _entered = span.enter(); + + 
index_wtxn.commit()?; + } // if the update processed successfully, we're going to store the new // stats of the index. Since the tasks have already been processed and From d7bcfb2d197e2b225ae8716308323acc1d3ef176 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 26 Nov 2024 14:04:16 +0100 Subject: [PATCH 015/158] fix clippy --- crates/milli/src/update/facet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index f4835e6a8..3eaf2f221 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -178,7 +178,7 @@ impl<'i> FacetsUpdate<'i> { // We clear the facet search databases. self.index.facet_id_string_fst.clear(wtxn)?; self.index.facet_id_normalized_string_strings.clear(wtxn)?; - return Ok(()); + Ok(()) } Some(data) => index_facet_search(wtxn, data, self.index), None => Ok(()), From 9008ecda3d0af01661024e79c649cb33fe270641 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Tue, 26 Nov 2024 14:44:24 +0100 Subject: [PATCH 016/158] Update crates/meilisearch-types/src/settings.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- crates/meilisearch-types/src/settings.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 48481e364..b12dfc9a2 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -272,8 +272,8 @@ impl Settings { embedders: Setting::Reset, search_cutoff_ms: Setting::Reset, localized_attributes: Setting::Reset, - facet_search: Setting::NotSet, - prefix_search: Setting::NotSet, + facet_search: Setting::Reset, + prefix_search: Setting::Reset, _kind: PhantomData, } } From f014e786840097e6a02415ef064ccee8c6986e41 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Tue, 26 Nov 2024 14:46:01 +0100 Subject: [PATCH 017/158] Update crates/milli/src/index.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- crates/milli/src/index.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 5bd24b9e4..b2f3cdbd1 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1693,9 +1693,6 @@ impl Index { Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, - #[cfg(not(test))] - prefix_count_threshold: 100, - #[cfg(test)] prefix_count_threshold: 100, }) } From 8f57b4fdf48c605dced71a4e8a89665fe32eabe9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 26 Nov 2024 14:10:52 +0100 Subject: [PATCH 018/158] Span to measure the part of db writes that is after the merge/extraction --- crates/milli/src/update/new/indexer/mod.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e285ca9cb..e7c5e30a6 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,4 +1,5 @@ use std::cmp::Ordering; +use std::sync::atomic::AtomicBool; use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; @@ -76,6 +77,7 @@ where SP: Fn(Progress) + Sync, { let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); + let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, 
wtxn)?; let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); @@ -100,6 +102,7 @@ where thread::scope(|s| -> Result<()> { let indexer_span = tracing::Span::current(); let embedders = &embedders; + let finished_extraction = &finished_extraction; // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; @@ -350,6 +353,8 @@ where (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); + finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); + Result::Ok(facet_field_ids_delta) })?; @@ -384,7 +389,15 @@ where let span = tracing::trace_span!(target: "indexing::write_db", "all"); let _entered = span.enter(); + let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); + let mut _entered_post_merge = None; + for operation in writer_receiver { + if _entered_post_merge.is_none() + && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) + { + _entered_post_merge = Some(span.enter()); + } match operation { WriterOperation::DbOperation(db_operation) => { let database = db_operation.database(index); From 2e896f30a50519d3c5435779dd2b1c41b66f158b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 26 Nov 2024 15:53:54 +0100 Subject: [PATCH 019/158] Fix PR comments --- .../meilisearch/tests/search/facet_search.rs | 60 +++++++++---------- .../tests/settings/prefix_search_settings.rs | 30 +++++----- crates/milli/src/index.rs | 6 +- crates/milli/src/update/facet/mod.rs | 15 ++--- 4 files changed, 54 insertions(+), 57 deletions(-) diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 52b8171c4..8fbeae293 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -41,8 +41,8 @@ async fn simple_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -65,8 +65,8 @@ async fn advanced_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "enabled": false })).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -89,8 +89,8 @@ async fn more_advanced_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "disableOnWords": ["adventre"] })).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -113,8 +113,8 @@ async fn simple_facet_search_with_max_values() { let documents = DOCUMENTS.clone(); 
index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -135,8 +135,8 @@ async fn simple_facet_search_by_count_with_max_values() { ) .await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -151,8 +151,8 @@ async fn non_filterable_facet_search_error() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -170,8 +170,8 @@ async fn facet_search_dont_support_words() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "words"})).await; @@ -188,8 +188,8 @@ async fn simple_facet_search_with_sort_by_count() { let documents = DOCUMENTS.clone(); index.update_settings_faceting(json!({ "sortFacetValuesBy": { "*": "count" } })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -207,8 +207,8 @@ async fn add_documents_and_deactivate_facet_search() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index .update_settings(json!({ "facetSearch": false, @@ -216,7 +216,7 @@ async fn add_documents_and_deactivate_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -237,10 +237,10 @@ async fn deactivate_facet_search_and_add_documents() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -261,10 
+261,10 @@ async fn deactivate_facet_search_add_documents_and_activate_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index .update_settings(json!({ @@ -272,7 +272,7 @@ async fn deactivate_facet_search_add_documents_and_activate_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(2).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -293,10 +293,10 @@ async fn deactivate_facet_search_add_documents_and_reset_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index .update_settings(json!({ @@ -304,7 +304,7 @@ async fn deactivate_facet_search_add_documents_and_reset_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(2).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; diff --git a/crates/meilisearch/tests/settings/prefix_search_settings.rs b/crates/meilisearch/tests/settings/prefix_search_settings.rs index 34a891f97..5da758a7d 100644 --- a/crates/meilisearch/tests/settings/prefix_search_settings.rs +++ b/crates/meilisearch/tests/settings/prefix_search_settings.rs @@ -29,8 +29,8 @@ async fn add_docs_and_disable() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(0).await; + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; let (response, code) = index .update_settings(json!({ @@ -39,7 +39,7 @@ async fn add_docs_and_disable() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await; // only 1 document should match index @@ -96,10 +96,10 @@ async fn disable_and_add_docs() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; // only 1 document should match index @@ -155,10 +155,10 @@ async fn disable_add_docs_and_enable() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; let (response, code) = index .update_settings(json!({ @@ -263,10 +263,10 @@ async fn disable_add_docs_and_reset() { })) .await; assert_eq!("202", code.as_str(), "{:?}", 
response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; let (response, code) = index .update_settings(json!({ @@ -370,10 +370,10 @@ async fn default_behavior() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await; - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; // all documents should match index diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index b2f3cdbd1..fe83877a7 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1690,11 +1690,7 @@ impl Index { pub fn prefix_settings(&self, rtxn: &RoTxn<'_>) -> Result { let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); - Ok(PrefixSettings { - compute_prefixes, - max_prefix_length: 4, - prefix_count_threshold: 100, - }) + Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) } } diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index 3eaf2f221..911296577 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -172,14 +172,15 @@ impl<'i> FacetsUpdate<'i> { incremental_update.execute(wtxn)?; } + if !self.index.facet_search(wtxn)? { + // If facet search is disabled, we don't need to compute facet search databases. + // We clear the facet search databases. + self.index.facet_id_string_fst.clear(wtxn)?; + self.index.facet_id_normalized_string_strings.clear(wtxn)?; + return Ok(()); + } + match self.normalized_delta_data { - _ if !self.index.facet_search(wtxn)? => { - // If facet search is disabled, we don't need to compute facet search databases. - // We clear the facet search databases. 
- self.index.facet_id_string_fst.clear(wtxn)?; - self.index.facet_id_normalized_string_strings.clear(wtxn)?; - Ok(()) - } Some(data) => index_facet_search(wtxn, data, self.index), None => Ok(()), } From 18a9af353c262e776104c7d0a7ac8cdba96a8850 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 27 Nov 2024 11:12:08 +0100 Subject: [PATCH 020/158] Update Charabia version to v0.9.2 --- Cargo.lock | 12 +++++++----- crates/milli/Cargo.toml | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f2a13125..e4789da4a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -969,8 +969,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf8921fe4d53ab8f9e8f9b72ce6f91726cfc40fffab1243d27db406b5e2e9cc2" dependencies = [ "aho-corasick", "csv", @@ -2709,7 +2710,8 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "irg-kvariants" version = "0.1.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26" dependencies = [ "csv", "once_cell", @@ -6017,9 +6019,9 @@ dependencies = [ [[package]] name = "wana_kana" -version = "3.0.0" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "477976a5c56fb7b014795df5a2ce08d2de8bcd4d5980844c5bd3978a7fd1c30b" +checksum = "a74666202acfcb4f9b995be2e3e9f7f530deb65e05a1407b8d0b30c9c451238a" dependencies = [ "fnv", "itertools 0.10.5", diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 1a3bfbcf1..a0bd86a42 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -18,8 +18,7 @@ bincode = "1.3.3" bstr = "1.9.1" bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -# charabia = { version = "0.9.0", default-features = false } -charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false } +charabia = { version = "0.9.2", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.13" deserr = "0.6.2" From 79671c9faa2bdc2e2dcd83191a31a00e7175d2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Nov 2024 12:19:32 +0100 Subject: [PATCH 021/158] Implement a first version of the bbqueue channels --- Cargo.lock | 7 ++++ crates/milli/Cargo.toml | 2 ++ crates/milli/src/update/new/channel.rs | 46 ++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index e4789da4a..e2069db87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -489,6 +489,11 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bbqueue" +version = "0.5.1" +source = "git+https://github.com/kerollmops/bbqueue#cbb87cc707b5af415ef203bdaf2443e06ba0d6d4" + [[package]] name = "benchmarks" version = "1.12.0" @@ -3611,6 +3616,7 @@ version = "1.12.0" dependencies = [ "allocator-api2", "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bbqueue", "big_s", "bimap", "bincode", @@ -3623,6 
+3629,7 @@ dependencies = [ "candle-transformers", "charabia", "concat-arrays", + "crossbeam", "crossbeam-channel", "csv", "deserr", diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index a0bd86a42..798a4ea19 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -98,6 +98,8 @@ allocator-api2 = "0.2.18" rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" +bbqueue = { git = "https://github.com/kerollmops/bbqueue" } +crossbeam = "0.8.4" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 00b471b52..21cd6b87d 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,6 +1,7 @@ use std::marker::PhantomData; use std::sync::atomic::{AtomicUsize, Ordering}; +use crossbeam::sync::{Parker, Unparker}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use heed::types::Bytes; use heed::BytesDecode; @@ -8,6 +9,7 @@ use memmap2::Mmap; use roaring::RoaringBitmap; use super::extract::FacetKind; +use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; @@ -16,6 +18,50 @@ use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; use crate::{DocumentId, Index}; +/// Creates a tuple of producer/receivers to be used by +/// the extractors and the writer loop. +/// +/// # Safety +/// +/// Panics if the number of provided bbqueue is not exactly equal +/// to the number of available threads in the rayon threadpool. +pub fn extractor_writer_bbqueue( + bbqueue: &[bbqueue::BBBuffer], +) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { + assert_eq!( + bbqueue.len(), + rayon::current_num_threads(), + "You must provide as many BBBuffer as the available number of threads to extract" + ); + + let parker = Parker::new(); + let extractors = ThreadLocal::with_capacity(bbqueue.len()); + let producers = rayon::broadcast(|bi| { + let bbqueue = &bbqueue[bi.index()]; + let (producer, consumer) = bbqueue.try_split_framed().unwrap(); + extractors.get_or(|| FullySend(producer)); + consumer + }); + + ( + ExtractorBbqueueSender { inner: extractors, unparker: parker.unparker().clone() }, + WriterBbqueueReceiver { inner: producers, parker }, + ) +} + +pub struct ExtractorBbqueueSender<'a> { + inner: ThreadLocal>>, + /// Used to wake up the receiver thread, + /// Used everytime we write something in the producer. + unparker: Unparker, +} + +pub struct WriterBbqueueReceiver<'a> { + inner: Vec>, + /// Used to park when no more work is required + parker: Parker, +} + /// The capacity of the channel is currently in number of messages. 
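For reference, a minimal sketch of the framed BBQueue flow that the sender/receiver types added above rely on, written against the published bbqueue 0.5 API (this patch pins a fork whose `BBBuffer::new` takes a runtime capacity instead of a const generic); it only illustrates the grant/commit/read/release cycle between an extractor (producer) and the writer loop (consumer), not milli's actual wiring.

```rust
// Minimal sketch of bbqueue's framed mode: the producer reserves a frame,
// fills it and commits it; the consumer reads whole frames back in order.
use bbqueue::BBBuffer;

fn main() {
    // The patch allocates one buffer per extractor thread; one small one here.
    static BB: BBBuffer<64> = BBBuffer::new();
    let (mut producer, mut consumer) = BB.try_split_framed().unwrap();

    // Extractor side: reserve a frame, write the payload, commit it.
    let mut grant = producer.grant(5).unwrap();
    grant[..5].copy_from_slice(b"hello");
    grant.commit(5);

    // Writer side: read the frame and release it once it has been handled.
    let frame = consumer.read().unwrap();
    assert_eq!(&frame[..], &b"hello"[..]);
    frame.release();
}
```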
pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); From 8442db8101ccc7df7e5b5ab98f8be593d659700a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Nov 2024 18:30:44 +0100 Subject: [PATCH 022/158] Implement mostly all senders --- .../cbo_roaring_bitmap_codec.rs | 19 + crates/milli/src/update/new/channel.rs | 641 ++++++++++-------- .../milli/src/update/new/extract/documents.rs | 11 +- .../src/update/new/extract/vectors/mod.rs | 6 +- crates/milli/src/update/new/indexer/mod.rs | 8 +- crates/milli/src/update/new/merger.rs | 17 +- 6 files changed, 398 insertions(+), 304 deletions(-) diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 257d5bd0a..cae1874dd 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -41,6 +41,25 @@ impl CboRoaringBitmapCodec { } } + pub fn serialize_into_writer( + roaring: &RoaringBitmap, + mut writer: W, + ) -> io::Result<()> { + if roaring.len() <= THRESHOLD as u64 { + // If the number of items (u32s) to encode is less than or equal to the threshold + // it means that it would weigh the same or less than the RoaringBitmap + // header, so we directly encode them using ByteOrder instead. + for integer in roaring { + writer.write_u32::(integer)?; + } + } else { + // Otherwise, we use the classic RoaringBitmapCodec that writes a header. + roaring.serialize_into(writer)?; + } + + Ok(()) + } + pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { if bytes.len() <= THRESHOLD * size_of::() { // If there is threshold or less than threshold integers that can fit into this array diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 21cd6b87d..cacc7b129 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,14 +1,19 @@ +use std::cell::RefCell; use std::marker::PhantomData; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::num::NonZeroU16; +use std::{mem, slice}; +use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bytemuck::{NoUninit, CheckedBitPattern}; use crossbeam::sync::{Parker, Unparker}; -use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; +use crossbeam_channel::{IntoIter, Receiver, SendError}; use heed::types::Bytes; use heed::BytesDecode; use memmap2::Mmap; use roaring::RoaringBitmap; use super::extract::FacetKind; +use super::ref_cell_ext::RefCellExt; use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; @@ -16,7 +21,7 @@ use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::index::{db_name, IndexEmbeddingConfig}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of producer/receivers to be used by /// the extractors and the writer loop. @@ -26,125 +31,97 @@ use crate::{DocumentId, Index}; /// Panics if the number of provided bbqueue is not exactly equal /// to the number of available threads in the rayon threadpool. 
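As context for the `serialize_into_writer` helper introduced earlier in this patch, here is a self-contained sketch of the size-based rule it applies; the `THRESHOLD` value below is assumed for illustration only (milli defines its own constant for this codec): small bitmaps are written as raw native-endian `u32`s, larger ones fall back to the standard RoaringBitmap serialization with its header.

```rust
// Illustrative only: mirrors the branch in serialize_into_writer with an
// assumed THRESHOLD; milli's codec defines the real value.
use std::io::{self, Write};

use roaring::RoaringBitmap;

const THRESHOLD: u64 = 4; // assumed, for illustration

fn cbo_serialize<W: Write>(bitmap: &RoaringBitmap, mut writer: W) -> io::Result<()> {
    if bitmap.len() <= THRESHOLD {
        // Few integers: raw u32s weigh less than a roaring header.
        for integer in bitmap {
            writer.write_all(&integer.to_ne_bytes())?;
        }
        Ok(())
    } else {
        // Otherwise use the classic roaring serialization.
        bitmap.serialize_into(writer)
    }
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    cbo_serialize(&(0..3).collect(), &mut buf)?;
    assert_eq!(buf.len(), 3 * 4); // three raw u32s, no header
    Ok(())
}
```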
pub fn extractor_writer_bbqueue( - bbqueue: &[bbqueue::BBBuffer], + bbbuffers: &[bbqueue::BBBuffer], ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { assert_eq!( - bbqueue.len(), + bbbuffers.len(), rayon::current_num_threads(), "You must provide as many BBBuffer as the available number of threads to extract" ); + let capacity = bbbuffers.first().unwrap().capacity(); let parker = Parker::new(); - let extractors = ThreadLocal::with_capacity(bbqueue.len()); + let extractors = ThreadLocal::with_capacity(bbbuffers.len()); let producers = rayon::broadcast(|bi| { - let bbqueue = &bbqueue[bi.index()]; + let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); - extractors.get_or(|| FullySend(producer)); + extractors.get_or(|| FullySend(RefCell::new(producer))); consumer }); ( - ExtractorBbqueueSender { inner: extractors, unparker: parker.unparker().clone() }, + ExtractorBbqueueSender { + inner: extractors, + capacity: capacity.checked_sub(9).unwrap(), + unparker: parker.unparker().clone(), + }, WriterBbqueueReceiver { inner: producers, parker }, ) } -pub struct ExtractorBbqueueSender<'a> { - inner: ThreadLocal>>, - /// Used to wake up the receiver thread, - /// Used everytime we write something in the producer. - unparker: Unparker, -} - pub struct WriterBbqueueReceiver<'a> { inner: Vec>, /// Used to park when no more work is required parker: Parker, } -/// The capacity of the channel is currently in number of messages. -pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { - let (sender, receiver) = crossbeam_channel::bounded(cap); - ( - ExtractorSender { - sender, - send_count: Default::default(), - writer_contentious_count: Default::default(), - extractor_contentious_count: Default::default(), - }, - WriterReceiver(receiver), - ) -} - -pub enum KeyValueEntry { - Small { key_length: usize, data: Box<[u8]> }, - Large { key_entry: KeyEntry, data: Mmap }, -} - -impl KeyValueEntry { - pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self { - let mut data = Vec::with_capacity(key.len() + value.len()); - data.extend_from_slice(key); - data.extend_from_slice(value); - KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() } - } - - fn from_large_key_value(key: &[u8], value: Mmap) -> Self { - KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value } - } - - pub fn key(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[..*key_length], - KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(), - } - } - - pub fn value(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[*key_length..], - KeyValueEntry::Large { key_entry: _, data } => &data[..], +impl<'a> WriterBbqueueReceiver<'a> { + pub fn read(&mut self) -> Option> { + loop { + for consumer in &mut self.inner { + // mark the frame as auto release + if let Some() = consumer.read() + } + break None; } } } -pub struct KeyEntry { - data: Box<[u8]>, +struct FrameWithHeader<'a> { + header: EntryHeader, + frame: FrameGrantR<'a>, } -impl KeyEntry { - pub fn from_key(key: &[u8]) -> Self { - KeyEntry { data: key.to_vec().into_boxed_slice() } +#[derive(Debug, Clone, Copy, CheckedBitPattern)] +#[repr(u8)] +enum EntryHeader { + /// Wether a put of the key/value pair or a delete of the given key. + DbOperation { + /// The database on which to perform the operation. + database: Database, + /// The key length in the buffer. 
+ /// + /// If None it means that the buffer is dedicated + /// to the key and it is therefore a deletion operation. + key_length: Option, + }, + ArroyDeleteVector { + docid: DocumentId, + }, + /// The embedding is the remaining space and represents a non-aligned [f32]. + ArroySetVector { + docid: DocumentId, + embedder_id: u8, + }, +} + +impl EntryHeader { + fn delete_key_size(key_length: u16) -> usize { + mem::size_of::() + key_length as usize } - pub fn entry(&self) -> &[u8] { - self.data.as_ref() + fn put_key_value_size(key_length: u16, value_length: usize) -> usize { + mem::size_of::() + key_length as usize + value_length + } + + fn bytes_of(&self) -> &[u8] { + /// TODO do the variant matching ourselves + todo!() } } -pub enum EntryOperation { - Delete(KeyEntry), - Write(KeyValueEntry), -} - -pub enum WriterOperation { - DbOperation(DbOperation), - ArroyOperation(ArroyOperation), -} - -pub enum ArroyOperation { - DeleteVectors { docid: DocumentId }, - SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec }, - SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding }, - Finish { configs: Vec }, -} - -pub struct DbOperation { - database: Database, - entry: EntryOperation, -} - -#[derive(Debug)] +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(u32)] pub enum Database { Main, Documents, @@ -220,82 +197,46 @@ impl From for Database { } } -impl DbOperation { - pub fn database(&self, index: &Index) -> heed::Database { - self.database.database(index) - } - - pub fn database_name(&self) -> &'static str { - self.database.database_name() - } - - pub fn entry(self) -> EntryOperation { - self.entry - } +pub struct ExtractorBbqueueSender<'a> { + inner: ThreadLocal>>>, + /// The capacity of this frame producer, will never be able to store more than that. + /// + /// Note that the FrameProducer requires up to 9 bytes to encode the length, + /// the capacity has been shrinked accordingly. + /// + /// + capacity: usize, + /// Used to wake up the receiver thread, + /// Used everytime we write something in the producer. + unparker: Unparker, } -pub struct WriterReceiver(Receiver); - -impl IntoIterator for WriterReceiver { - type Item = WriterOperation; - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -pub struct ExtractorSender { - sender: Sender, - /// The number of message we sent in total in the channel. - send_count: AtomicUsize, - /// The number of times we sent something in a channel that was full. - writer_contentious_count: AtomicUsize, - /// The number of times we sent something in a channel that was empty. 
- extractor_contentious_count: AtomicUsize, -} - -impl Drop for ExtractorSender { - fn drop(&mut self) { - let send_count = *self.send_count.get_mut(); - let writer_contentious_count = *self.writer_contentious_count.get_mut(); - let extractor_contentious_count = *self.extractor_contentious_count.get_mut(); - tracing::debug!( - "Extractor channel stats: {send_count} sends, \ - {writer_contentious_count} writer contentions ({}%), \ - {extractor_contentious_count} extractor contentions ({}%)", - (writer_contentious_count as f32 / send_count as f32) * 100.0, - (extractor_contentious_count as f32 / send_count as f32) * 100.0 - ) - } -} - -impl ExtractorSender { - pub fn docids(&self) -> WordDocidsSender<'_, D> { +impl<'b> ExtractorBbqueueSender<'b> { + pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> { WordDocidsSender { sender: self, _marker: PhantomData } } - pub fn facet_docids(&self) -> FacetDocidsSender<'_> { + pub fn facet_docids<'a>(&'a self) -> FacetDocidsSender<'a, 'b> { FacetDocidsSender { sender: self } } - pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> { - FieldIdDocidFacetSender(self) + pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> { + FieldIdDocidFacetSender(&self) } - pub fn documents(&self) -> DocumentsSender<'_> { - DocumentsSender(self) + pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> { + DocumentsSender(&self) } - pub fn embeddings(&self) -> EmbeddingSender<'_> { - EmbeddingSender(&self.sender) + pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> { + EmbeddingSender(&self) } - pub fn geo(&self) -> GeoSender<'_> { - GeoSender(&self.sender) + pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> { + GeoSender(&self) } - fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + fn send_delete_vector(&self, docid: DocumentId) -> crate::Result<()> { match self .sender .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) @@ -305,18 +246,69 @@ impl ExtractorSender { } } - fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> { - if self.sender.is_full() { - self.writer_contentious_count.fetch_add(1, Ordering::SeqCst); - } - if self.sender.is_empty() { - self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst); + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let value_length = value.len(); + let total_length = EntryHeader::put_key_value_size(key_length, value_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); } - self.send_count.fetch_add(1, Ordering::SeqCst); - match self.sender.send(WriterOperation::DbOperation(op)) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + let payload_header = + EntryHeader::DbOperation { database, key_length: NonZeroU16::new(key_length) }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + let (key_out, value_out) = remaining.split_at_mut(key.len()); + key_out.copy_from_slice(key); + value_out.copy_from_slice(value); + + // We could 
commit only the used memory. + grant.commit(total_length); + + break Ok(()); + } + } + + fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let total_length = EntryHeader::delete_key_size(key_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = EntryHeader::DbOperation { database, key_length: None }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + remaining.copy_from_slice(key); + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); } } } @@ -356,159 +348,237 @@ impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } -pub trait DocidsSender { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>; - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>; -} - -pub struct WordDocidsSender<'a, D> { - sender: &'a ExtractorSender, +pub struct WordDocidsSender<'a, 'b, D> { + sender: &'a ExtractorBbqueueSender<'b>, _marker: PhantomData, } -impl DocidsSender for WordDocidsSender<'_, D> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), +impl WordDocidsSender<'_, '_, D> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + + let total_length = EntryHeader::put_key_value_size(key_length, value_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); } - } - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -pub struct FacetDocidsSender<'a> { - sender: &'a ExtractorSender, -} - -impl DocidsSender for FacetDocidsSender<'_> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let (facet_kind, key) = FacetKind::extract_from_key(key); - let database = Database::from(facet_kind); - let entry = match facet_kind { - // skip level group size - FacetKind::String | FacetKind::Number => { - // add facet group size - let value = [&[1], value].concat(); - EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value)) - } - _ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)), + let payload_header = EntryHeader::DbOperation { + database: D::DATABASE, + key_length: NonZeroU16::new(key_length), }; - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - 
Err(SendError(_)) => Err(SendError(())), + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + let (key_out, value_out) = remaining.split_at_mut(key.len()); + key_out.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); } } - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let total_length = EntryHeader::delete_key_size(key_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = EntryHeader::DbOperation { database: D::DATABASE, key_length: None }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + remaining.copy_from_slice(key); + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); + } + } +} + +pub struct FacetDocidsSender<'a, 'b> { + sender: &'a ExtractorBbqueueSender<'b>, +} + +impl FacetDocidsSender<'_, '_> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + let (facet_kind, key) = FacetKind::extract_from_key(key); - let database = Database::from(facet_kind); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + let key_length = key.len().try_into().unwrap(); + + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let value_length = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. + FacetKind::Number | FacetKind::String => value_length + 1, + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, + }; + + let total_length = EntryHeader::put_key_value_size(key_length, value_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = EntryHeader::DbOperation { + database: Database::from(facet_kind), + key_length: NonZeroU16::new(key_length), + }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + let (key_out, value_out) = remaining.split_at_mut(key.len()); + key_out.copy_from_slice(key); + + let value_out = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. 
+ FacetKind::String | FacetKind::Number => { + let (first, remaining) = value_out.split_first_mut().unwrap(); + *first = 1; + remaining + } + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, + }; + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); + } + } + + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let (facet_kind, key) = FacetKind::extract_from_key(key); + let key_length = key.len().try_into().unwrap(); + + let total_length = EntryHeader::delete_key_size(key_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = + EntryHeader::DbOperation { database: Database::from(facet_kind), key_length: None }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + remaining.copy_from_slice(key); + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); } } } -pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); +pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl FieldIdDocidFacetSender<'_> { - pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { +impl FieldIdDocidFacetSender<'_, '_> { + pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetStrings, key, value) } - pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn write_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &[])); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetF64s, key, &[]) } - pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_string(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetStrings, key) } - pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetF64s, key) } } -pub struct DocumentsSender<'a>(&'a ExtractorSender); +pub struct 
DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl DocumentsSender<'_> { +impl DocumentsSender<'_, '_> { /// TODO do that efficiently pub fn uncompressed( &self, docid: DocumentId, external_id: String, document: &KvReaderFieldId, - ) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( - &docid.to_be_bytes(), - document.as_bytes(), - )); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( + ) -> crate::Result<()> { + self.0.write_key_value(Database::Documents, &docid.to_be_bytes(), document.as_bytes())?; + self.0.write_key_value( + Database::ExternalDocumentsIds, external_id.as_bytes(), &docid.to_be_bytes(), - )); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + ) } - pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - + pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> { + self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?; self.0.send_delete_vector(docid)?; - - let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes()) } } -pub struct EmbeddingSender<'a>(&'a Sender); +pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl EmbeddingSender<'_> { +impl EmbeddingSender<'_, '_> { pub fn set_vectors( &self, docid: DocumentId, embedder_id: u8, embeddings: Vec, - ) -> StdResult<(), SendError<()>> { + ) -> crate::Result<()> { self.0 .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { docid, @@ -541,33 +611,36 @@ impl EmbeddingSender<'_> { } } -pub struct GeoSender<'a>(&'a Sender); +pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl GeoSender<'_> { +impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::DbOperation(DbOperation { - database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( - GEO_RTREE_KEY.as_bytes(), - value, - )), - })) - .map_err(|_| SendError(())) + todo!("set rtree from file") + // self.0 + // .send(WriterOperation::DbOperation(DbOperation { + // database: Database::Main, + // entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( + // GEO_RTREE_KEY.as_bytes(), + // value, + // )), + // })) + // .map_err(|_| SendError(())) } pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { - let mut buffer = Vec::new(); - bitmap.serialize_into(&mut buffer).unwrap(); + todo!("serialize directly into bbqueue (as a real roaringbitmap not a cbo)") - self.0 - .send(WriterOperation::DbOperation(DbOperation { - database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( - 
GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), - &buffer, - )), - })) - .map_err(|_| SendError(())) + // let mut buffer = Vec::new(); + // bitmap.serialize_into(&mut buffer).unwrap(); + + // self.0 + // .send(WriterOperation::DbOperation(DbOperation { + // database: Database::Main, + // entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( + // GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), + // &buffer, + // )), + // })) + // .map_err(|_| SendError(())) } } diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index aeb1d5694..13307025a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -12,13 +12,14 @@ use crate::update::new::thread_local::FullySend; use crate::update::new::DocumentChange; use crate::vector::EmbeddingConfigs; use crate::Result; -pub struct DocumentsExtractor<'a> { - document_sender: &'a DocumentsSender<'a>, + +pub struct DocumentsExtractor<'a, 'b> { + document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs, } -impl<'a> DocumentsExtractor<'a> { - pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self { +impl<'a, 'b> DocumentsExtractor<'a, 'b> { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { Self { document_sender, embedders } } } @@ -29,7 +30,7 @@ pub struct DocumentExtractorData { pub field_distribution_delta: HashMap, } -impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { type Data = FullySend>; fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 8ac73a8d7..52b13f37d 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -20,7 +20,7 @@ use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAb pub struct EmbeddingExtractor<'a> { embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } @@ -28,7 +28,7 @@ pub struct EmbeddingExtractor<'a> { impl<'a> EmbeddingExtractor<'a> { pub fn new( embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { @@ -368,7 +368,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, - sender: &EmbeddingSender<'a>, + sender: EmbeddingSender<'a>, has_manual_generation: Option<&'a str>, ) -> Result<()> { if let Some(external_docid) = has_manual_generation { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 35dea7a98..88a4c2f77 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -76,7 +76,11 @@ where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync, { - let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); + /// TODO restrict memory and remove this memory from the extractors bum allocators + let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) + .map(|_| 
bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread + .collect(); + let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -115,7 +119,7 @@ where // document but we need to create a function that collects and compresses documents. let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender, embedders); + let document_extractor = DocumentsExtractor::new(document_sender, embedders); let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 039c56b9d..f2809b376 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -19,7 +19,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>( datastore: impl IntoIterator>>, rtxn: &RoTxn, index: &Index, - geo_sender: GeoSender<'_>, + geo_sender: GeoSender<'_, '_>, must_stop_processing: &MSP, ) -> Result<()> where @@ -62,19 +62,19 @@ where } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_docids<'extractor, MSP>( +pub fn merge_and_send_docids<'extractor, MSP, D>( mut caches: Vec>, database: Database, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: WordDocidsSender, must_stop_processing: &MSP, ) -> Result<()> where MSP: Fn() -> bool + Sync, + D: DatabaseType + Sync, { transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } @@ -82,8 +82,7 @@ where let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap).unwrap(); Ok(()) } Operation::Delete => { @@ -101,21 +100,19 @@ pub fn merge_and_send_facet_docids<'extractor>( mut caches: Vec>, database: FacetDatabases, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: FacetDocidsSender, ) -> Result { transpose_and_freeze_caches(&mut caches)? .into_par_iter() .map(|frozen| { let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? 
{ Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap).unwrap(); Ok(()) } Operation::Delete => { From 2094ce8a9a8febbab73efdeb0c477cda1c9c67c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 10:19:59 +0100 Subject: [PATCH 023/158] Move the arroy building after the writing loop --- crates/milli/src/update/new/indexer/mod.rs | 81 +++++++++++----------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 88a4c2f77..f82f4af37 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -76,7 +76,7 @@ where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync, { - /// TODO restrict memory and remove this memory from the extractors bum allocators + /// TODO restrict memory and remove this memory from the extractors bump allocators let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .collect(); @@ -100,6 +100,7 @@ where send_progress, }; + let mut index_embeddings = index.embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; @@ -296,7 +297,6 @@ where 'vectors: { - let mut index_embeddings = index.embedding_configs(&rtxn)?; if index_embeddings.is_empty() { break 'vectors; } @@ -322,8 +322,6 @@ where } } } - - embedding_sender.finish(index_embeddings).unwrap(); } 'geo: { @@ -457,46 +455,47 @@ where embeddings.append(embedding).unwrap(); } - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; - } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = arroy_writers - .get(&embedder_id) - .expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - ArroyOperation::Finish { configs } => { - let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); - let _entered = span.enter(); - - (indexing_context.send_progress)(Progress::from_step( - Step::WritingEmbeddingsToDatabase, - )); - - for ( - _embedder_index, - (_embedder_name, _embedder, writer, dimensions), - ) in &mut arroy_writers - { - let dimensions = *dimensions; - writer.build_and_quantize( - wtxn, - &mut rng, - dimensions, - false, - &indexing_context.must_stop_processing, - )?; - } - - index.put_embedding_configs(wtxn, configs)?; - } - }, - } + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + ArroyOperation::SetVector { docid, embedder_id, embedding } => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, &embedding)?; + } + _otherwise => unreachable!(), + }, } } + 'vectors: { + let span = + tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); + let _entered = span.enter(); + + if index_embeddings.is_empty() { + break 'vectors; + } + + (indexing_context.send_progress)(Progress::from_step( + Step::WritingEmbeddingsToDatabase, + )); + + for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { + 
let dimensions = *dimensions; + writer.build_and_quantize( + wtxn, + &mut rng, + dimensions, + false, + &indexing_context.must_stop_processing, + )?; + } + + index.put_embedding_configs(wtxn, index_embeddings)?; + } + (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); let facet_field_ids_delta = extractor_handle.join().unwrap()?; From e1e76f39d044d083bd7bf0552cc20b36d948af7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:03:39 +0100 Subject: [PATCH 024/158] Clean up dependencies --- Cargo.lock | 21 ++++----------------- Cargo.toml | 3 --- crates/benchmarks/Cargo.toml | 2 +- crates/dump/Cargo.toml | 2 +- crates/index-scheduler/Cargo.toml | 4 ++-- crates/index-scheduler/src/lib.rs | 10 +++++----- crates/meilisearch-auth/Cargo.toml | 2 +- crates/meilisearch-types/Cargo.toml | 2 +- crates/meilisearch/Cargo.toml | 2 +- crates/milli/Cargo.toml | 3 +-- 10 files changed, 17 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2069db87..8a0a6b3d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1251,19 +1251,6 @@ dependencies = [ "itertools 0.10.5", ] -[[package]] -name = "crossbeam" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -2621,7 +2608,7 @@ dependencies = [ "big_s", "bincode", "bumpalo", - "crossbeam", + "crossbeam-channel", "csv", "derive_builder 0.20.0", "dump", @@ -3629,7 +3616,6 @@ dependencies = [ "candle-transformers", "charabia", "concat-arrays", - "crossbeam", "crossbeam-channel", "csv", "deserr", @@ -4750,8 +4736,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#8ff028e484fb6192a0acf5a669eaf18c30cada6e" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81dc953b2244ddd5e7860cb0bb2a790494b898ef321d4aff8e260efab60cc88" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 5e53dbfa5..89a17d8fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,3 @@ opt-level = 3 opt-level = 3 [profile.dev.package.roaring] opt-level = 3 - -[patch.crates-io] -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" } diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index eec30ea3f..ccd256546 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -24,7 +24,7 @@ tempfile = "3.14.0" criterion = { version = "0.5.1", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.10.6" +roaring = "0.10.7" [build-dependencies] anyhow = "1.0.86" diff --git a/crates/dump/Cargo.toml b/crates/dump/Cargo.toml index f9d2a9a0b..679a97b4e 100644 --- a/crates/dump/Cargo.toml +++ b/crates/dump/Cargo.toml @@ -17,7 +17,7 @@ http = "1.1.0" meilisearch-types = { path = "../meilisearch-types" } once_cell = "1.19.0" regex = "1.10.5" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } tar = "0.4.41" diff --git a/crates/index-scheduler/Cargo.toml 
b/crates/index-scheduler/Cargo.toml index 657dd6dfe..ad4c1b4b9 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -24,7 +24,7 @@ meilisearch-types = { path = "../meilisearch-types" } page_size = "0.6.0" raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } synchronoise = "1.0.1" @@ -45,7 +45,7 @@ bumpalo = "3.16.0" [dev-dependencies] arroy = "0.5.0" big_s = "1.0.2" -crossbeam = "0.8.4" +crossbeam-channel = "0.5.13" insta = { version = "1.39.0", features = ["json", "redactions"] } maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index cef24c1ea..1a1c71bae 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -407,7 +407,7 @@ pub struct IndexScheduler { /// /// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation. #[cfg(test)] - test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + test_breakpoint_sdr: crossbeam_channel::Sender<(Breakpoint, bool)>, /// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler. /// @@ -476,7 +476,7 @@ impl IndexScheduler { /// Create an index scheduler and start its run loop. pub fn new( options: IndexSchedulerOptions, - #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + #[cfg(test)] test_breakpoint_sdr: crossbeam_channel::Sender<(Breakpoint, bool)>, #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, ) -> Result { std::fs::create_dir_all(&options.tasks_path)?; @@ -2237,7 +2237,7 @@ mod tests { use std::time::Instant; use big_s::S; - use crossbeam::channel::RecvTimeoutError; + use crossbeam_channel::RecvTimeoutError; use file_store::File; use insta::assert_json_snapshot; use maplit::btreeset; @@ -2289,7 +2289,7 @@ mod tests { configuration: impl Fn(&mut IndexSchedulerOptions), ) -> (Self, IndexSchedulerHandle) { let tempdir = TempDir::new().unwrap(); - let (sender, receiver) = crossbeam::channel::bounded(0); + let (sender, receiver) = crossbeam_channel::bounded(0); let indexer_config = IndexerConfig { skip_index_budget: true, ..Default::default() }; @@ -2421,7 +2421,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, index_scheduler: IndexScheduler, - test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, + test_breakpoint_rcv: crossbeam_channel::Receiver<(Breakpoint, bool)>, last_breakpoint: Breakpoint, } diff --git a/crates/meilisearch-auth/Cargo.toml b/crates/meilisearch-auth/Cargo.toml index ae0095ab4..591a40158 100644 --- a/crates/meilisearch-auth/Cargo.toml +++ b/crates/meilisearch-auth/Cargo.toml @@ -17,7 +17,7 @@ hmac = "0.12.1" maplit = "1.0.2" meilisearch-types = { path = "../meilisearch-types" } rand = "0.8.5" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } sha2 = "0.10.8" diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index 349c06080..aca06a018 100644 --- a/crates/meilisearch-types/Cargo.toml +++ 
b/crates/meilisearch-types/Cargo.toml @@ -25,7 +25,7 @@ fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" serde_json = "1.0.120" diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 2884f0c9c..8e134ebd0 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -103,7 +103,7 @@ tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } -roaring = "0.10.2" +roaring = "0.10.7" mopa-maintained = "0.2.3" [dev-dependencies] diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 798a4ea19..b66dec9a4 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -42,7 +42,7 @@ obkv = "0.3.0" once_cell = "1.19.0" ordered-float = "4.2.1" rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } rstar = { version = "0.12.0", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] } @@ -99,7 +99,6 @@ rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" bbqueue = { git = "https://github.com/kerollmops/bbqueue" } -crossbeam = "0.8.4" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } From 6ac5b3b136086b8b25b1eb8dc2d6678e39846262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:36:30 +0100 Subject: [PATCH 025/158] Finish most of the channels types --- crates/milli/src/error.rs | 9 +- crates/milli/src/update/new/channel.rs | 662 +++++++++++------- .../src/update/new/extract/vectors/mod.rs | 2 +- crates/milli/src/update/new/indexer/mod.rs | 132 ++-- 4 files changed, 474 insertions(+), 331 deletions(-) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 4da57a3e1..800dfa375 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -62,9 +62,14 @@ pub enum InternalError { #[error(transparent)] Store(#[from] MdbError), #[error("Cannot delete {key:?} from database {database_name}: {error}")] - StoreDeletion { database_name: &'static str, key: Vec, error: heed::Error }, + StoreDeletion { database_name: &'static str, key: Box<[u8]>, error: heed::Error }, #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] - StorePut { database_name: &'static str, key: Vec, value_length: usize, error: heed::Error }, + StorePut { + database_name: &'static str, + key: Box<[u8]>, + value_length: usize, + error: heed::Error, + }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index cacc7b129..d2681c915 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,12 +1,11 @@ use std::cell::RefCell; use std::marker::PhantomData; +use std::mem; use std::num::NonZeroU16; -use std::{mem, slice}; use bbqueue::framed::{FrameGrantR, FrameProducer}; -use bytemuck::{NoUninit, 
CheckedBitPattern}; -use crossbeam::sync::{Parker, Unparker}; -use crossbeam_channel::{IntoIter, Receiver, SendError}; +use bytemuck::{checked, CheckedBitPattern, NoUninit}; +use crossbeam_channel::SendError; use heed::types::Bytes; use heed::BytesDecode; use memmap2::Mmap; @@ -17,21 +16,32 @@ use super::ref_cell_ext::RefCellExt; use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; -use crate::index::{db_name, IndexEmbeddingConfig}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; use crate::{CboRoaringBitmapCodec, DocumentId, Index}; -/// Creates a tuple of producer/receivers to be used by +/// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// +/// The `channel_capacity` parameter defines the number of +/// too-large-to-fit-in-BBQueue entries that can be sent through +/// a crossbeam channel. This parameter must stay low to make +/// sure we do not use too much memory. +/// +/// Note that the channel is also used to wake-up the receiver +/// wehn new stuff is available in any BBQueue buffer but we send +/// a message in this queue only if it is empty to avoid filling +/// the channel *and* the BBQueue. +/// /// # Safety /// -/// Panics if the number of provided bbqueue is not exactly equal +/// Panics if the number of provided BBQueues is not exactly equal /// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( bbbuffers: &[bbqueue::BBBuffer], + channel_capacity: usize, ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { assert_eq!( bbbuffers.len(), @@ -40,88 +50,252 @@ pub fn extractor_writer_bbqueue( ); let capacity = bbbuffers.first().unwrap().capacity(); - let parker = Parker::new(); - let extractors = ThreadLocal::with_capacity(bbbuffers.len()); - let producers = rayon::broadcast(|bi| { + // Read the field description to understand this + let capacity = capacity.checked_sub(9).unwrap(); + + let producers = ThreadLocal::with_capacity(bbbuffers.len()); + let consumers = rayon::broadcast(|bi| { let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); - extractors.get_or(|| FullySend(RefCell::new(producer))); + producers.get_or(|| FullySend(RefCell::new(producer))); consumer }); - ( - ExtractorBbqueueSender { - inner: extractors, - capacity: capacity.checked_sub(9).unwrap(), - unparker: parker.unparker().clone(), - }, - WriterBbqueueReceiver { inner: producers, parker }, - ) + let (sender, receiver) = crossbeam_channel::bounded(channel_capacity); + let sender = ExtractorBbqueueSender { sender, producers, capacity }; + let receiver = WriterBbqueueReceiver { receiver, consumers }; + (sender, receiver) +} + +pub struct ExtractorBbqueueSender<'a> { + /// This channel is used to wake-up the receiver and + /// send large entries that cannot fit in the BBQueue. + sender: crossbeam_channel::Sender, + /// A memory buffer, one by thread, is used to serialize + /// the entries directly in this shared, lock-free space. + producers: ThreadLocal>>>, + /// The capacity of this frame producer, will never be able to store more than that. + /// + /// Note that the FrameProducer requires up to 9 bytes to encode the length, + /// the capacity has been shrinked accordingly. 
+ /// + /// + capacity: usize, } pub struct WriterBbqueueReceiver<'a> { - inner: Vec>, - /// Used to park when no more work is required - parker: Parker, + /// Used to wake up when new entries are available either in + /// any BBQueue buffer or directly sent throught this channel + /// (still written to disk). + receiver: crossbeam_channel::Receiver, + /// The BBQueue frames to read when waking-up. + consumers: Vec>, +} + +/// The action to perform on the receiver/writer side. +pub enum ReceiverAction { + /// Wake up, you have frames to read for the BBQueue buffers. + WakeUp, + /// An entry that cannot fit in the BBQueue buffers has been + /// written to disk, memory-mapped and must be written in the + /// database. + LargeEntry { + /// The database where the entry must be written. + database: Database, + /// The key of the entry that must be written in the database. + key: Box<[u8]>, + /// The large value that must be written. + /// + /// Note: We can probably use a `File` here and + /// use `Database::put_reserved` instead of memory-mapping. + value: Mmap, + }, } impl<'a> WriterBbqueueReceiver<'a> { + pub fn recv(&mut self) -> Option { + self.receiver.recv().ok() + } + pub fn read(&mut self) -> Option> { - loop { - for consumer in &mut self.inner { - // mark the frame as auto release - if let Some() = consumer.read() + for consumer in &mut self.consumers { + if let Some(frame) = consumer.read() { + return Some(FrameWithHeader::from(frame)); } - break None; } + None } } -struct FrameWithHeader<'a> { +pub struct FrameWithHeader<'a> { header: EntryHeader, frame: FrameGrantR<'a>, } -#[derive(Debug, Clone, Copy, CheckedBitPattern)] -#[repr(u8)] -enum EntryHeader { - /// Wether a put of the key/value pair or a delete of the given key. - DbOperation { - /// The database on which to perform the operation. - database: Database, - /// The key length in the buffer. - /// - /// If None it means that the buffer is dedicated - /// to the key and it is therefore a deletion operation. - key_length: Option, - }, - ArroyDeleteVector { - docid: DocumentId, - }, - /// The embedding is the remaining space and represents a non-aligned [f32]. - ArroySetVector { - docid: DocumentId, - embedder_id: u8, - }, +impl FrameWithHeader<'_> { + pub fn header(&self) -> EntryHeader { + self.header + } + + pub fn frame(&self) -> &FrameGrantR<'_> { + &self.frame + } } -impl EntryHeader { - fn delete_key_size(key_length: u16) -> usize { - mem::size_of::() + key_length as usize - } - - fn put_key_value_size(key_length: u16, value_length: usize) -> usize { - mem::size_of::() + key_length as usize + value_length - } - - fn bytes_of(&self) -> &[u8] { - /// TODO do the variant matching ourselves - todo!() +impl<'a> From> for FrameWithHeader<'a> { + fn from(mut frame: FrameGrantR<'a>) -> Self { + frame.auto_release(true); + FrameWithHeader { header: EntryHeader::from_slice(&frame[..]), frame } } } #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(u32)] +#[repr(C)] +/// Wether a put of the key/value pair or a delete of the given key. +pub struct DbOperation { + /// The database on which to perform the operation. + pub database: Database, + /// The key length in the buffer. + /// + /// If None it means that the buffer is dedicated + /// to the key and it is therefore a deletion operation. 
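+    /// (`key_value` below splits the frame according to this field.)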
+ pub key_length: Option, +} + +impl DbOperation { + pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { + /// TODO replace the return type by an enum Write | Delete + let skip = EntryHeader::variant_size() + mem::size_of::(); + match self.key_length { + Some(key_length) => { + let (key, value) = frame[skip..].split_at(key_length.get() as usize); + (key, Some(value)) + } + None => (&frame[skip..], None), + } + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(transparent)] +pub struct ArroyDeleteVector { + pub docid: DocumentId, +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embedding is the remaining space and represents a non-aligned [f32]. +pub struct ArroySetVector { + pub docid: DocumentId, + pub embedder_id: u8, + _padding: [u8; 3], +} + +impl ArroySetVector { + pub fn read_embedding_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + vec.clear(); + let skip = EntryHeader::variant_size() + mem::size_of::(); + let bytes = &frame[skip..]; + bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { + let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); + vec.push(f); + }); + &vec[..] + } +} + +#[derive(Debug, Clone, Copy)] +#[repr(u8)] +pub enum EntryHeader { + DbOperation(DbOperation), + ArroyDeleteVector(ArroyDeleteVector), + ArroySetVector(ArroySetVector), +} + +impl EntryHeader { + const fn variant_size() -> usize { + mem::size_of::() + } + + const fn variant_id(&self) -> u8 { + match self { + EntryHeader::DbOperation(_) => 0, + EntryHeader::ArroyDeleteVector(_) => 1, + EntryHeader::ArroySetVector(_) => 2, + } + } + + const fn total_key_value_size(key_length: NonZeroU16, value_length: usize) -> usize { + Self::variant_size() + + mem::size_of::() + + key_length.get() as usize + + value_length + } + + const fn total_key_size(key_length: NonZeroU16) -> usize { + Self::total_key_value_size(key_length, 0) + } + + const fn total_delete_vector_size() -> usize { + Self::variant_size() + mem::size_of::() + } + + /// The `embedding_length` corresponds to the number of `f32` in the embedding. 
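+    /// It is a dimension count, not a byte count; the payload occupies `embedding_length * size_of::<f32>()` bytes.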
+ fn total_set_vector_size(embedding_length: usize) -> usize { + Self::variant_size() + + mem::size_of::() + + embedding_length * mem::size_of::() + } + + fn header_size(&self) -> usize { + let payload_size = match self { + EntryHeader::DbOperation(op) => mem::size_of_val(op), + EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), + EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), + }; + Self::variant_size() + payload_size + } + + fn from_slice(slice: &[u8]) -> EntryHeader { + let (variant_id, remaining) = slice.split_first().unwrap(); + match variant_id { + 0 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::DbOperation(header) + } + 1 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroyDeleteVector(header) + } + 2 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVector(header) + } + id => panic!("invalid variant id: {id}"), + } + } + + fn serialize_into(&self, header_bytes: &mut [u8]) { + let (first, remaining) = header_bytes.split_first_mut().unwrap(); + let payload_bytes = match self { + EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), + EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), + EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), + }; + *first = self.variant_id(); + remaining.copy_from_slice(payload_bytes); + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(u16)] pub enum Database { Main, Documents, @@ -197,20 +371,6 @@ impl From for Database { } } -pub struct ExtractorBbqueueSender<'a> { - inner: ThreadLocal>>>, - /// The capacity of this frame producer, will never be able to store more than that. - /// - /// Note that the FrameProducer requires up to 9 bytes to encode the length, - /// the capacity has been shrinked accordingly. - /// - /// - capacity: usize, - /// Used to wake up the receiver thread, - /// Used everytime we write something in the producer. - unparker: Unparker, -} - impl<'b> ExtractorBbqueueSender<'b> { pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> { WordDocidsSender { sender: self, _marker: PhantomData } @@ -236,80 +396,171 @@ impl<'b> ExtractorBbqueueSender<'b> { GeoSender(&self) } - fn send_delete_vector(&self, docid: DocumentId) -> crate::Result<()> { - match self - .sender - .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); + let total_length = EntryHeader::total_delete_vector_size(); + if total_length > capacity { + unreachable!("entry larger that the BBQueue capacity"); } + + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + }; + + payload_header.serialize_into(&mut grant); + + // We could commit only the used memory. 
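+        // Committing publishes the whole frame, header included, to the consumer side.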
+ grant.commit(total_length); + + Ok(()) + } + + fn set_vector( + &self, + docid: DocumentId, + embedder_id: u8, + embedding: &[f32], + ) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let payload_header = + EntryHeader::ArroySetVector(ArroySetVector { docid, embedder_id, _padding: [0; 3] }); + let total_length = EntryHeader::total_set_vector_size(embedding.len()); + if total_length > capacity { + unreachable!("entry larger that the BBQueue capacity"); + } + + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + }; + + // payload_header.serialize_into(&mut grant); + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + remaining.copy_from_slice(bytemuck::cast_slice(embedding)); + + // We could commit only the used memory. + grant.commit(total_length); + + Ok(()) } fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + self.write_key_value_with(database, key_length, value.len(), |buffer| { + let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); + key_buffer.copy_from_slice(key); + value_buffer.copy_from_slice(value); + Ok(()) + }) + } + + fn write_key_value_with( + &self, + database: Database, + key_length: NonZeroU16, + value_length: usize, + key_value_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8]) -> crate::Result<()>, + { let capacity = self.capacity; - let refcell = self.inner.get().unwrap(); + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let key_length = key.len().try_into().unwrap(); - let value_length = value.len(); - let total_length = EntryHeader::put_key_value_size(key_length, value_length); + let operation = DbOperation { database, key_length: Some(key_length) }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_value_size(key_length, value_length); if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); + unreachable!("entry larger that the BBQueue capacity"); } - let payload_header = - EntryHeader::DbOperation { database, key_length: NonZeroU16::new(key_length) }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), - }; + } + }; - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - let (key_out, value_out) = remaining.split_at_mut(key.len()); - key_out.copy_from_slice(key); - value_out.copy_from_slice(value); + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_value_writer(remaining)?; - // We could commit only the used memory. - grant.commit(total_length); + // We could commit only the used memory. 
+ grant.commit(total_length); - break Ok(()); - } + Ok(()) } fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + self.delete_entry_with(database, key_length, |buffer| { + buffer.copy_from_slice(key); + Ok(()) + }) + } + + fn delete_entry_with( + &self, + database: Database, + key_length: NonZeroU16, + key_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8]) -> crate::Result<()>, + { let capacity = self.capacity; - let refcell = self.inner.get().unwrap(); + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let key_length = key.len().try_into().unwrap(); - let total_length = EntryHeader::delete_key_size(key_length); + // For deletion we do not specify the key length, + // it's in the remaining bytes. + let operation = DbOperation { database, key_length: None }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_size(key_length); if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); + unreachable!("entry larger that the BBQueue capacity"); } - let payload_header = EntryHeader::DbOperation { database, key_length: None }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), - }; + } + }; - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - remaining.copy_from_slice(key); + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_writer(remaining)?; - // We could commit only the used memory. - grant.commit(total_length); + // We could commit only the used memory. + grant.commit(total_length); - break Ok(()); - } + Ok(()) } } @@ -355,72 +606,18 @@ pub struct WordDocidsSender<'a, 'b, D> { impl WordDocidsSender<'_, '_, D> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - - let key_length = key.len().try_into().unwrap(); + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); - - let total_length = EntryHeader::put_key_value_size(key_length, value_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = EntryHeader::DbOperation { - database: D::DATABASE, - key_length: NonZeroU16::new(key_length), - }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - let (key_out, value_out) = remaining.split_at_mut(key.len()); - key_out.copy_from_slice(key); - CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; - - // We could commit only the used memory. 
- grant.commit(total_length); - - break Ok(()); - } + self.sender.write_key_value_with(D::DATABASE, key_length, value_length, |buffer| { + let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); + key_buffer.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; + Ok(()) + }) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - - let key_length = key.len().try_into().unwrap(); - let total_length = EntryHeader::delete_key_size(key_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = EntryHeader::DbOperation { database: D::DATABASE, key_length: None }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - remaining.copy_from_slice(key); - - // We could commit only the used memory. - grant.commit(total_length); - - break Ok(()); - } + self.sender.delete_entry(D::DATABASE, key) } } @@ -430,13 +627,10 @@ pub struct FacetDocidsSender<'a, 'b> { impl FacetDocidsSender<'_, '_> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - let (facet_kind, key) = FacetKind::extract_from_key(key); - let key_length = key.len().try_into().unwrap(); + let database = Database::from(facet_kind); + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); let value_length = match facet_kind { // We must take the facet group size into account @@ -445,26 +639,8 @@ impl FacetDocidsSender<'_, '_> { FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, }; - let total_length = EntryHeader::put_key_value_size(key_length, value_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = EntryHeader::DbOperation { - database: Database::from(facet_kind), - key_length: NonZeroU16::new(key_length), - }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - let (key_out, value_out) = remaining.split_at_mut(key.len()); + self.sender.write_key_value_with(database, key_length, value_length, |buffer| { + let (key_out, value_out) = buffer.split_at_mut(key.len()); key_out.copy_from_slice(key); let value_out = match facet_kind { @@ -477,47 +653,17 @@ impl FacetDocidsSender<'_, '_> { } FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, }; + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; - // We could commit only the used memory. 
- grant.commit(total_length); - - break Ok(()); - } + Ok(()) + }) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - let (facet_kind, key) = FacetKind::extract_from_key(key); - let key_length = key.len().try_into().unwrap(); - - let total_length = EntryHeader::delete_key_size(key_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = - EntryHeader::DbOperation { database: Database::from(facet_kind), key_length: None }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - remaining.copy_from_slice(key); - - // We could commit only the used memory. - grant.commit(total_length); - - break Ok(()); - } + let database = Database::from(facet_kind); + self.sender.delete_entry(database, key) } } @@ -565,7 +711,7 @@ impl DocumentsSender<'_, '_> { pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> { self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?; - self.0.send_delete_vector(docid)?; + self.0.delete_vector(docid)?; self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes()) } } @@ -579,13 +725,10 @@ impl EmbeddingSender<'_, '_> { embedder_id: u8, embeddings: Vec, ) -> crate::Result<()> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings, - })) - .map_err(|_| SendError(())) + for embedding in embeddings { + self.set_vector(docid, embedder_id, embedding)?; + } + Ok(()) } pub fn set_vector( @@ -593,21 +736,8 @@ impl EmbeddingSender<'_, '_> { docid: DocumentId, embedder_id: u8, embedding: Embedding, - ) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVector { - docid, - embedder_id, - embedding, - })) - .map_err(|_| SendError(())) - } - - /// Marks all embedders as "to be built" - pub fn finish(self, configs: Vec) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { configs })) - .map_err(|_| SendError(())) + ) -> crate::Result<()> { + self.0.set_vector(docid, embedder_id, &embedding[..]) } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 52b13f37d..42278d443 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -76,7 +76,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.data, &self.possible_embedding_mistakes, self.threads, - self.sender, + &self.sender, &context.doc_alloc, )) } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index f82f4af37..1fd60b610 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -40,7 +40,7 @@ use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use 
crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; +use crate::vector::{ArroyWrapper, EmbeddingConfigs}; use crate::{ Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder, UserError, @@ -80,7 +80,7 @@ where let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .collect(); - let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers); + let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -386,7 +386,11 @@ where }) .collect(); + // Used by by the ArroySetVector to copy the embedding into an + // aligned memory area, required by arroy to accept a new vector. + let mut aligned_embedding = Vec::new(); let mut arroy_writers = arroy_writers?; + { let span = tracing::trace_span!(target: "indexing::write_db", "all"); let _entered = span.enter(); @@ -394,81 +398,85 @@ where let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); let mut _entered_post_merge = None; - for operation in writer_receiver { + while let Some(action) = writer_receiver.recv() { if _entered_post_merge.is_none() && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) { _entered_post_merge = Some(span.enter()); } - match operation { - WriterOperation::DbOperation(db_operation) => { - let database = db_operation.database(index); - let database_name = db_operation.database_name(); - match db_operation.entry() { - EntryOperation::Delete(e) => match database.delete(wtxn, e.entry()) { - Ok(false) => unreachable!("We tried to delete an unknown key"), - Ok(_) => (), - Err(error) => { - return Err(Error::InternalError( - InternalError::StoreDeletion { - database_name, - key: e.entry().to_owned(), - error, - }, - )); - } - }, - EntryOperation::Write(e) => { - if let Err(error) = database.put(wtxn, e.key(), e.value()) { - return Err(Error::InternalError(InternalError::StorePut { - database_name, - key: e.key().to_owned(), - value_length: e.value().len(), - error, - })); - } - } + + match action { + ReceiverAction::WakeUp => (), + ReceiverAction::LargeEntry { database, key, value } => { + let database_name = database.database_name(); + let database = database.database(index); + if let Err(error) = database.put(wtxn, &key, &value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key, + value_length: value.len(), + error, + })); } } - WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { - ArroyOperation::DeleteVectors { docid } => { - for ( - _embedder_index, - (_embedder_name, _embedder, writer, dimensions), - ) in &mut arroy_writers - { + } + + while let Some(frame_with_header) = writer_receiver.read() { + match frame_with_header.header() { + EntryHeader::DbOperation(operation) => { + let database_name = operation.database.database_name(); + let database = operation.database.database(index); + let frame = frame_with_header.frame(); + match operation.key_value(frame) { + (key, Some(value)) => { + if let Err(error) = database.put(wtxn, key, value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: key.into(), + value_length: value.len(), + error, + })); + } + } + (key, None) => match database.delete(wtxn, key) { + Ok(false) => { + unreachable!("We tried to delete an unknown key: {key:?}") + } + Ok(_) => (), 
+ Err(error) => { + return Err(Error::InternalError( + InternalError::StoreDeletion { + database_name, + key: key.into(), + error, + }, + )); + } + }, + } + } + EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in &mut arroy_writers { let dimensions = *dimensions; writer.del_items(wtxn, dimensions, docid)?; } } - ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings: raw_embeddings, - } => { - let (_, _, writer, dimensions) = arroy_writers - .get(&embedder_id) - .expect("requested a missing embedder"); - - let mut embeddings = Embeddings::new(*dimensions); - for embedding in raw_embeddings { - embeddings.append(embedding).unwrap(); - } - - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; + EntryHeader::ArroySetVector(asv) => { + let ArroySetVector { docid, embedder_id, .. } = asv; + let frame = frame_with_header.frame(); + let embedding = asv.read_embedding_into_vec(frame, &mut aligned_embedding); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, embedding)?; + } } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - _otherwise => unreachable!(), - }, + } } } + todo!("read the BBQueue once the channel is closed"); + 'vectors: { let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); From 70802eb7c72473fb5cb8a1b0258a9a6ab88b81f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:45:47 +0100 Subject: [PATCH 026/158] Fix most issues with the lifetimes --- crates/milli/src/update/new/channel.rs | 7 ++++++ .../new/extract/faceted/extract_facets.rs | 6 ++--- .../src/update/new/extract/vectors/mod.rs | 22 +++++++++---------- crates/milli/src/update/new/indexer/mod.rs | 6 ++--- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index d2681c915..d1d64814e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -93,6 +93,7 @@ pub struct WriterBbqueueReceiver<'a> { } /// The action to perform on the receiver/writer side. +#[derive(Debug)] pub enum ReceiverAction { /// Wake up, you have frames to read for the BBQueue buffers. 
WakeUp, @@ -599,6 +600,7 @@ impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } +#[derive(Clone, Copy)] pub struct WordDocidsSender<'a, 'b, D> { sender: &'a ExtractorBbqueueSender<'b>, _marker: PhantomData, @@ -621,6 +623,7 @@ impl WordDocidsSender<'_, '_, D> { } } +#[derive(Clone, Copy)] pub struct FacetDocidsSender<'a, 'b> { sender: &'a ExtractorBbqueueSender<'b>, } @@ -667,6 +670,7 @@ impl FacetDocidsSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl FieldIdDocidFacetSender<'_, '_> { @@ -691,6 +695,7 @@ impl FieldIdDocidFacetSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl DocumentsSender<'_, '_> { @@ -716,6 +721,7 @@ impl DocumentsSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl EmbeddingSender<'_, '_> { @@ -741,6 +747,7 @@ impl EmbeddingSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl GeoSender<'_, '_> { diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 9ad37d52c..490dada65 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -25,14 +25,14 @@ use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; -pub struct FacetedExtractorData<'a> { +pub struct FacetedExtractorData<'a, 'b> { attributes_to_extract: &'a [&'a str], - sender: &'a FieldIdDocidFacetSender<'a>, + sender: &'a FieldIdDocidFacetSender<'a, 'b>, grenad_parameters: GrenadParameters, buckets: usize, } -impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { type Data = RefCell>; fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 42278d443..1110432fa 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -18,17 +18,17 @@ use crate::vector::error::{ use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; -pub struct EmbeddingExtractor<'a> { +pub struct EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, - sender: EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a> EmbeddingExtractor<'a> { +impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( embedders: &'a EmbeddingConfigs, - sender: EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { @@ -43,7 +43,7 @@ pub struct EmbeddingExtractorData<'extractor>( unsafe impl MostlySend for EmbeddingExtractorData<'_> {} -impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -76,7 +76,7 @@ 
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.data, &self.possible_embedding_mistakes, self.threads, - &self.sender, + self.sender, &context.doc_alloc, )) } @@ -259,7 +259,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { // Currently this is the case as: // 1. BVec are inside of the bumaplo // 2. All other fields are either trivial (u8) or references. -struct Chunks<'a, 'extractor> { +struct Chunks<'a, 'b, 'extractor> { texts: BVec<'a, &'a str>, ids: BVec<'a, DocumentId>, @@ -270,11 +270,11 @@ struct Chunks<'a, 'extractor> { possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, } -impl<'a, 'extractor> Chunks<'a, 'extractor> { +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { #[allow(clippy::too_many_arguments)] pub fn new( embedder: &'a Embedder, @@ -284,7 +284,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, ) -> Self { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); @@ -368,7 +368,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, - sender: EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, ) -> Result<()> { if let Some(external_docid) = has_manual_generation { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 1fd60b610..982868d93 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -80,7 +80,7 @@ where let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .collect(); - let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); + let (extractor_sender, mut writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -302,7 +302,7 @@ where } let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); + let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); @@ -363,7 +363,6 @@ where let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let vector_arroy = index.vector_arroy; - let mut rng = rand::rngs::StdRng::seed_from_u64(42); let indexer_span = tracing::Span::current(); let arroy_writers: Result> = embedders .inner_as_ref() @@ -490,6 +489,7 @@ where Step::WritingEmbeddingsToDatabase, )); + let mut rng = rand::rngs::StdRng::seed_from_u64(42); for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { let dimensions = *dimensions; writer.build_and_quantize( From 08d641336588a51bf7ada203828ba2b9c19123bf Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:46:41 +0100 Subject: [PATCH 027/158] Fix result types --- crates/milli/src/update/new/extract/faceted/extract_facets.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 490dada65..f2132ce38 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -318,7 +318,7 @@ impl<'doc> DelAddFacetValue<'doc> { docid: DocumentId, sender: &FieldIdDocidFacetSender, doc_alloc: &Bump, - ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { + ) -> crate::Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); for ((fid, value), deladd) in self.strings { if let Ok(s) = std::str::from_utf8(&value) { From acec45ad7c3414db493132fd37fdd951b61529b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:59:29 +0100 Subject: [PATCH 028/158] Send a WakeUp when writing data in the BBQueue buffers --- crates/milli/src/update/new/channel.rs | 24 ++++ crates/milli/src/update/new/indexer/mod.rs | 136 +++++++++++++-------- 2 files changed, 107 insertions(+), 53 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index d1d64814e..0a6d37943 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -422,6 +422,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } @@ -460,6 +466,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } @@ -511,6 +523,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } @@ -561,6 +579,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. 
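+        // A skipped wake-up is harmless: frames are drained on the next message,
+        // and the indexer drains whatever remains once this channel is closed.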
+ if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 982868d93..835ee240b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -420,61 +420,27 @@ where } } - while let Some(frame_with_header) = writer_receiver.read() { - match frame_with_header.header() { - EntryHeader::DbOperation(operation) => { - let database_name = operation.database.database_name(); - let database = operation.database.database(index); - let frame = frame_with_header.frame(); - match operation.key_value(frame) { - (key, Some(value)) => { - if let Err(error) = database.put(wtxn, key, value) { - return Err(Error::InternalError(InternalError::StorePut { - database_name, - key: key.into(), - value_length: value.len(), - error, - })); - } - } - (key, None) => match database.delete(wtxn, key) { - Ok(false) => { - unreachable!("We tried to delete an unknown key: {key:?}") - } - Ok(_) => (), - Err(error) => { - return Err(Error::InternalError( - InternalError::StoreDeletion { - database_name, - key: key.into(), - error, - }, - )); - } - }, - } - } - EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { - for (_index, (_name, _embedder, writer, dimensions)) in &mut arroy_writers { - let dimensions = *dimensions; - writer.del_items(wtxn, dimensions, docid)?; - } - } - EntryHeader::ArroySetVector(asv) => { - let ArroySetVector { docid, embedder_id, .. } = asv; - let frame = frame_with_header.frame(); - let embedding = asv.read_embedding_into_vec(frame, &mut aligned_embedding); - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, embedding)?; - } - } - } + // Every time the is a message in the channel we search + // for new entries in the BBQueue buffers. + write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + &arroy_writers, + &mut aligned_embedding, + )?; } - } - todo!("read the BBQueue once the channel is closed"); + // Once the extractor/writer channel is closed + // we must process the remaining BBQueue messages. + write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + &arroy_writers, + &mut aligned_embedding, + )?; + } 'vectors: { let span = @@ -548,6 +514,70 @@ where Ok(()) } +/// A function dedicated to manage all the available BBQueue frames. +/// +/// It reads all the available frames, do the corresponding database operations +/// and stops when no frame are available. 
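+/// It is called after every message received on the extractor channel, and once
+/// more after the channel is closed, to drain the remaining frames.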
+fn write_from_bbqueue( + writer_receiver: &mut WriterBbqueueReceiver<'_>, + index: &Index, + wtxn: &mut RwTxn<'_>, + arroy_writers: &HashMap, + aligned_embedding: &mut Vec, +) -> crate::Result<()> { + while let Some(frame_with_header) = writer_receiver.read() { + match frame_with_header.header() { + EntryHeader::DbOperation(operation) => { + let database_name = operation.database.database_name(); + let database = operation.database.database(index); + let frame = frame_with_header.frame(); + match operation.key_value(frame) { + (key, Some(value)) => { + if let Err(error) = database.put(wtxn, key, value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: key.into(), + value_length: value.len(), + error, + })); + } + } + (key, None) => match database.delete(wtxn, key) { + Ok(false) => { + unreachable!("We tried to delete an unknown key: {key:?}") + } + Ok(_) => (), + Err(error) => { + return Err(Error::InternalError(InternalError::StoreDeletion { + database_name, + key: key.into(), + error, + })); + } + }, + } + } + EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers { + let dimensions = *dimensions; + writer.del_items(wtxn, dimensions, docid)?; + } + } + EntryHeader::ArroySetVector(asv) => { + let ArroySetVector { docid, embedder_id, .. } = asv; + let frame = frame_with_header.frame(); + let embedding = asv.read_embedding_into_vec(frame, aligned_embedding); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, embedding)?; + } + } + } + + Ok(()) +} + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] fn compute_prefix_database( index: &Index, From cc63802115d864ce169a3f86cf669ed356f8167d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 14:58:03 +0100 Subject: [PATCH 029/158] Modify and return the IndexEmbeddings to write them later --- crates/milli/src/update/new/indexer/mod.rs | 25 +++++++++++----------- crates/milli/src/update/new/steps.rs | 4 ++-- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 835ee240b..89c1b850d 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -117,7 +117,6 @@ where let rtxn = index.read_txn()?; - // document but we need to create a function that collects and compresses documents. 
let document_sender = extractor_sender.documents(); let document_extractor = DocumentsExtractor::new(document_sender, embedders); @@ -180,10 +179,6 @@ where } { - - - - let WordDocidsCaches { word_docids, word_fid_docids, @@ -296,7 +291,6 @@ where } 'vectors: { - if index_embeddings.is_empty() { break 'vectors; } @@ -308,7 +302,14 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); let _entered = span.enter(); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingEmbeddings, + )?; } { let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); @@ -357,7 +358,7 @@ where finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); - Result::Ok(facet_field_ids_delta) + Result::Ok((facet_field_ids_delta, index_embeddings)) })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); @@ -442,6 +443,10 @@ where )?; } + (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); + + let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; + 'vectors: { let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); @@ -470,10 +475,6 @@ where index.put_embedding_configs(wtxn, index_embeddings)?; } - (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); - - let facet_field_ids_delta = extractor_handle.join().unwrap()?; - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); if index.facet_search(wtxn)? { diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index 7c2441933..bee1be260 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -11,8 +11,8 @@ pub enum Step { ExtractingEmbeddings, WritingGeoPoints, WritingToDatabase, - WritingEmbeddingsToDatabase, WaitingForExtractors, + WritingEmbeddingsToDatabase, PostProcessingFacets, PostProcessingWords, Finalizing, @@ -29,8 +29,8 @@ impl Step { Step::ExtractingEmbeddings => "extracting embeddings", Step::WritingGeoPoints => "writing geo points", Step::WritingToDatabase => "writing to database", - Step::WritingEmbeddingsToDatabase => "writing embeddings to database", Step::WaitingForExtractors => "waiting for extractors", + Step::WritingEmbeddingsToDatabase => "writing embeddings to database", Step::PostProcessingFacets => "post-processing facets", Step::PostProcessingWords => "post-processing words", Step::Finalizing => "finalizing", From a514ce472acfb6bbe329f01ad3be27f0c487bb20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 14:59:04 +0100 Subject: [PATCH 030/158] Make clippy happy --- crates/milli/src/update/new/channel.rs | 8 ++++---- crates/milli/src/update/new/merger.rs | 7 ------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 0a6d37943..fc05baa89 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -382,19 +382,19 @@ impl<'b> ExtractorBbqueueSender<'b> { } pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> { - FieldIdDocidFacetSender(&self) + FieldIdDocidFacetSender(self) } pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> { - 
DocumentsSender(&self) + DocumentsSender(self) } pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> { - EmbeddingSender(&self) + EmbeddingSender(self) } pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> { - GeoSender(&self) + GeoSender(self) } fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index f2809b376..f8af84177 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -249,10 +249,3 @@ fn merge_cbo_bitmaps( } } } - -/// TODO Return the slice directly from the serialize_into method -fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { - buffer.clear(); - CboRoaringBitmapCodec::serialize_into(bitmap, buffer); - buffer.as_slice() -} From 98d4a2909e85c8ec5ba1a6dd4b2a6b2d63cf42c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 16:05:44 +0100 Subject: [PATCH 031/158] Fix the way we spawn the rayon threadpool --- crates/index-scheduler/src/batch.rs | 110 +++--- crates/milli/src/update/new/channel.rs | 36 +- crates/milli/src/update/new/indexer/mod.rs | 440 +++++++++++---------- 3 files changed, 313 insertions(+), 273 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 04cdb912f..bec1fedf5 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1351,7 +1351,10 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; @@ -1399,21 +1402,19 @@ impl IndexScheduler { } if tasks.iter().any(|res| res.error.is_none()) { - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - primary_key, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1489,34 +1490,34 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; - pool.install(|| { - let indexer = - UpdateByFunction::new(candidates, context.clone(), code.clone()); - let document_changes = indexer.into_changes(&primary_key)?; - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); + let document_changes = + pool.install(|| indexer.into_changes(&primary_key)).unwrap()?; - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // cannot change primary key in DocumentEdition - &document_changes, - embedders, - &|| must_stop_processing.get(), - 
&send_progress, - )?; + let embedders = index.embedding_configs(index_wtxn)?; + let embedders = self.embedders(embedders)?; - Result::Ok(()) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // cannot change primary key in DocumentEdition + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1641,7 +1642,10 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; @@ -1652,21 +1656,19 @@ impl IndexScheduler { let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // document deletion never changes primary key - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // document deletion never changes primary key + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index fc05baa89..beba80ac8 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -55,6 +55,12 @@ pub fn extractor_writer_bbqueue( let producers = ThreadLocal::with_capacity(bbbuffers.len()); let consumers = rayon::broadcast(|bi| { + eprintln!( + "hello thread #{:?} (#{:?}, #{:?})", + bi.index(), + std::thread::current().name(), + std::thread::current().id(), + ); let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); producers.get_or(|| FullySend(RefCell::new(producer))); @@ -399,7 +405,15 @@ impl<'b> ExtractorBbqueueSender<'b> { fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); + let refcell = match self.producers.get() { + Some(refcell) => refcell, + None => panic!( + "hello thread #{:?} (#{:?}, #{:?})", + rayon::current_thread_index(), + std::thread::current().name(), + std::thread::current().id() + ), + }; let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); @@ -438,7 +452,15 @@ impl<'b> ExtractorBbqueueSender<'b> { embedding: &[f32], ) -> crate::Result<()> { let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); + let refcell = match self.producers.get() { + Some(refcell) => refcell, + None => panic!( + "hello thread #{:?} (#{:?}, #{:?})", + rayon::current_thread_index(), + std::thread::current().name(), + std::thread::current().id() + ), + }; let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = @@ -496,7 +518,15 @@ impl<'b> ExtractorBbqueueSender<'b> { F: FnOnce(&mut [u8]) -> 
crate::Result<()>, { let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); + let refcell = match self.producers.get() { + Some(refcell) => refcell, + None => panic!( + "hello thread #{:?} (#{:?}, #{:?})", + rayon::current_thread_index(), + std::thread::current().name(), + std::thread::current().id() + ), + }; let mut producer = refcell.0.borrow_mut_or_yield(); let operation = DbOperation { database, key_length: Some(key_length) }; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 89c1b850d..b7d5431b4 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -62,6 +62,7 @@ mod update_by_function; pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( wtxn: &mut RwTxn, index: &'index Index, + pool: &ThreadPoolNoAbort, grenad_parameters: GrenadParameters, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, @@ -77,10 +78,15 @@ where SP: Fn(Progress) + Sync, { /// TODO restrict memory and remove this memory from the extractors bump allocators - let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) - .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread - .collect(); - let (extractor_sender, mut writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); + let bbbuffers: Vec<_> = pool + .install(|| { + (0..rayon::current_num_threads()) + .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread + .collect() + }) + .unwrap(); + let (extractor_sender, mut writer_receiver) = + pool.install(|| extractor_writer_bbqueue(&bbbuffers, 1000)).unwrap(); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -112,253 +118,255 @@ where let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); - - let rtxn = index.read_txn()?; - - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(document_sender, embedders); - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - { - let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + pool.install(move || { + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); let _entered = span.enter(); - extract(document_changes, - &document_extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingDocuments, - )?; - } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); - let _entered = span.enter(); - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. - *current = current.saturating_add_signed(delta); + + let rtxn = index.read_txn()?; + + // document but we need to create a function that collects and compresses documents. 
+ let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(document_sender, embedders); + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + let _entered = span.enter(); + extract(document_changes, + &document_extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingDocuments, + )?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); + let _entered = span.enter(); + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. + *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids); } - document_extractor_data.docids_delta.apply_to(document_ids); + + field_distribution.retain(|_, v| *v != 0); } - field_distribution.retain(|_, v| *v != 0); - } + let facet_field_ids_delta; - let facet_field_ids_delta; + { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); + let _entered = span.enter(); - { - let caches = { - let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); - let _entered = span.enter(); + FacetedDocidsExtractor::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + &extractor_sender.field_id_docid_facet_sender(), + Step::ExtractingFacets + )? + }; - FacetedDocidsExtractor::run_extraction( + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); + let _entered = span.enter(); + + facet_field_ids_delta = merge_and_send_facet_docids( + caches, + FacetDatabases::new(index), + index, + extractor_sender.facet_docids(), + )?; + } + } + + { + let WordDocidsCaches { + word_docids, + word_fid_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); + + WordDocidsExtractors::run_extraction( grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, - &extractor_sender.field_id_docid_facet_sender(), - Step::ExtractingFacets + Step::ExtractingWords )? 
- }; + }; - { - let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); - let _entered = span.enter(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_docids, + index.word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - facet_field_ids_delta = merge_and_send_facet_docids( - caches, - FacetDatabases::new(index), - index, - extractor_sender.facet_docids(), - )?; - } - } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_fid_docids, + index.word_fid_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - { - let WordDocidsCaches { - word_docids, - word_fid_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + exact_word_docids, + index.exact_word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - WordDocidsExtractors::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWords - )? - }; + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_position_docids, + index.word_position_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_docids, - index.word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); + merge_and_send_docids( + fid_word_count_docids, + index.field_id_word_count_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_fid_docids, - index.word_fid_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. + let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + + ::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + Step::ExtractingWordProximity, + )? 
+ }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); + let _entered = span.enter(); + + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - exact_word_docids, - index.exact_word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + 'vectors: { + if index_embeddings.is_empty() { + break 'vectors; + } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_position_docids, - index.word_position_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); + let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); - let _entered = span.enter(); - merge_and_send_docids( - fid_word_count_docids, - index.field_id_word_count_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingEmbeddings, + )?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); + let _entered = span.enter(); - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. - let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let caches = { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); - - ::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWordProximity, - )? 
- }; - - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); - let _entered = span.enter(); - - merge_and_send_docids( - caches, - index.word_pair_proximity_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } - - 'vectors: { - if index_embeddings.is_empty() { - break 'vectors; - } - - let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); - let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); - - extract( - document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingEmbeddings, - )?; - } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); - let _entered = span.enter(); - - for config in &mut index_embeddings { - 'data: for data in datastore.iter_mut() { - let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { continue 'data; }; + deladd.apply_to(&mut config.user_provided); + } } } } - } - 'geo: { - let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { - break 'geo; - }; - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + 'geo: { + let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? 
else { + break 'geo; + }; + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); - let _entered = span.enter(); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); + let _entered = span.enter(); - extract( - document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::WritingGeoPoints + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::WritingGeoPoints + )?; + } + + merge_and_send_rtree( + datastore, + &rtxn, + index, + extractor_sender.geo(), + &indexing_context.must_stop_processing, )?; } - merge_and_send_rtree( - datastore, - &rtxn, - index, - extractor_sender.geo(), - &indexing_context.must_stop_processing, - )?; - } + (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); - (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); + finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); - finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); - - Result::Ok((facet_field_ids_delta, index_embeddings)) + Result::Ok((facet_field_ids_delta, index_embeddings)) + }).unwrap() })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); From e83534a4305963c857423cf03c3612e4e31a2b07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 16:27:43 +0100 Subject: [PATCH 032/158] Fix the indexer::index to correctly use the rayon::ThreadPool --- crates/milli/src/update/new/channel.rs | 49 +++++----------------- crates/milli/src/update/new/indexer/mod.rs | 17 ++++---- 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index beba80ac8..70c4a6042 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -4,6 +4,7 @@ use std::mem; use std::num::NonZeroU16; use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; use crossbeam_channel::SendError; use heed::types::Bytes; @@ -25,6 +26,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// +/// The `bbqueue_capacity` represent the number of bytes allocated +/// to each BBQueue buffer and is not the sum of all of them. +/// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through /// a crossbeam channel. This parameter must stay low to make @@ -40,14 +44,11 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Panics if the number of provided BBQueues is not exactly equal /// to the number of available threads in the rayon threadpool. 
pub fn extractor_writer_bbqueue( - bbbuffers: &[bbqueue::BBBuffer], + bbbuffers: &mut Vec, + bbbuffer_capacity: usize, channel_capacity: usize, ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { - assert_eq!( - bbbuffers.len(), - rayon::current_num_threads(), - "You must provide as many BBBuffer as the available number of threads to extract" - ); + bbbuffers.resize_with(rayon::current_num_threads(), || BBBuffer::new(bbbuffer_capacity)); let capacity = bbbuffers.first().unwrap().capacity(); // Read the field description to understand this @@ -55,12 +56,6 @@ pub fn extractor_writer_bbqueue( let producers = ThreadLocal::with_capacity(bbbuffers.len()); let consumers = rayon::broadcast(|bi| { - eprintln!( - "hello thread #{:?} (#{:?}, #{:?})", - bi.index(), - std::thread::current().name(), - std::thread::current().id(), - ); let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); producers.get_or(|| FullySend(RefCell::new(producer))); @@ -405,15 +400,7 @@ impl<'b> ExtractorBbqueueSender<'b> { fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { let capacity = self.capacity; - let refcell = match self.producers.get() { - Some(refcell) => refcell, - None => panic!( - "hello thread #{:?} (#{:?}, #{:?})", - rayon::current_thread_index(), - std::thread::current().name(), - std::thread::current().id() - ), - }; + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); @@ -452,15 +439,7 @@ impl<'b> ExtractorBbqueueSender<'b> { embedding: &[f32], ) -> crate::Result<()> { let capacity = self.capacity; - let refcell = match self.producers.get() { - Some(refcell) => refcell, - None => panic!( - "hello thread #{:?} (#{:?}, #{:?})", - rayon::current_thread_index(), - std::thread::current().name(), - std::thread::current().id() - ), - }; + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = @@ -518,15 +497,7 @@ impl<'b> ExtractorBbqueueSender<'b> { F: FnOnce(&mut [u8]) -> crate::Result<()>, { let capacity = self.capacity; - let refcell = match self.producers.get() { - Some(refcell) => refcell, - None => panic!( - "hello thread #{:?} (#{:?}, #{:?})", - rayon::current_thread_index(), - std::thread::current().name(), - std::thread::current().id() - ), - }; + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); let operation = DbOperation { database, key_length: Some(key_length) }; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index b7d5431b4..3a4406aef 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -77,17 +77,18 @@ where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync, { - /// TODO restrict memory and remove this memory from the extractors bump allocators - let bbbuffers: Vec<_> = pool + let mut bbbuffers = Vec::new(); + let finished_extraction = AtomicBool::new(false); + let (extractor_sender, mut writer_receiver) = pool .install(|| { - (0..rayon::current_num_threads()) - .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread - .collect() + /// TODO restrict memory and remove this memory from the extractors bump allocators + extractor_writer_bbqueue( + &mut bbbuffers, + 100 * 1024 * 1024, // 100 MiB + 1000, + ) }) .unwrap(); - let (extractor_sender, mut writer_receiver) = - 
pool.install(|| extractor_writer_bbqueue(&bbbuffers, 1000)).unwrap(); - let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); From da650f834ee4fcb12d4a38a0e545f548bb06660f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 17:04:49 +0100 Subject: [PATCH 033/158] Plug the NoPanicThreadPool in the tests and benchmarks --- crates/benchmarks/benches/indexing.rs | 31 +++++++++++++++++++ crates/benchmarks/benches/utils.rs | 1 + crates/fuzzers/src/bin/fuzz-indexing.rs | 1 + crates/milli/src/index.rs | 3 ++ .../milli/src/search/new/tests/integration.rs | 1 + .../milli/src/update/index_documents/mod.rs | 11 +++++++ .../milli/tests/search/facet_distribution.rs | 1 + crates/milli/tests/search/mod.rs | 1 + crates/milli/tests/search/query_criteria.rs | 1 + crates/milli/tests/search/typo_tolerance.rs | 1 + 10 files changed, 52 insertions(+) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 2f33c3454..d3f307be3 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -157,6 +157,7 @@ fn indexing_songs_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -223,6 +224,7 @@ fn reindexing_songs_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -267,6 +269,7 @@ fn reindexing_songs_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -335,6 +338,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -411,6 +415,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -455,6 +460,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -495,6 +501,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -562,6 +569,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -628,6 +636,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -694,6 +703,7 @@ fn indexing_wiki(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + 
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -759,6 +769,7 @@ fn reindexing_wiki(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -803,6 +814,7 @@ fn reindexing_wiki(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -870,6 +882,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -946,6 +959,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -991,6 +1005,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1032,6 +1047,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1098,6 +1114,7 @@ fn indexing_movies_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1163,6 +1180,7 @@ fn reindexing_movies_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1207,6 +1225,7 @@ fn reindexing_movies_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1274,6 +1293,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1321,6 +1341,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index f335938b9..ee927940f 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -135,6 +135,7 @@ fn main() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index fe83877a7..268d33cd9 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1821,6 +1821,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ 
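Each benchmark call site in this patch builds its no-abort pool inline; a small helper along the following lines (hypothetical, not part of the patch) would cut the repetition. It assumes ThreadPoolNoAbortBuilder::build() yields the ThreadPoolNoAbort that the new indexer::index(wtxn, index, pool, ...) parameter expects, and borrows the thread_name pattern from the scheduler change above:

fn bench_indexing_pool() -> milli::ThreadPoolNoAbort {
    // Name the workers so indexing threads are identifiable in traces.
    milli::ThreadPoolNoAbortBuilder::new()
        .thread_name(|i| format!("bench-indexing-thread-{i}"))
        .build()
        .unwrap()
}

// Call sites would then pass &bench_indexing_pool() instead of building the pool inline.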
-1911,6 +1912,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1991,6 +1993,7 @@ pub(crate) mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 79668b34b..5db5b400b 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -83,6 +83,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 186cc501d..3988b311c 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -2155,6 +2155,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2216,6 +2217,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2268,6 +2270,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2319,6 +2322,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2372,6 +2376,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2430,6 +2435,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2481,6 +2487,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2532,6 +2539,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2725,6 +2733,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2783,6 +2792,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2838,6 +2848,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git 
a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 61d0697ff..418cdc356 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -64,6 +64,7 @@ fn test_facet_distribution_with_no_facet_values() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 1287b59d5..08b22d7b6 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -101,6 +101,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index 3e56eeff0..8401f0444 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -333,6 +333,7 @@ fn criteria_ascdesc() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 7ac9a1e4b..dbee296ee 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -142,6 +142,7 @@ fn test_typo_disabled_on_word() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, From 5c488e20cc07a66aff3794fd94c3c84d47170b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 18:03:45 +0100 Subject: [PATCH 034/158] Send the geo rtree through crossbeam channel --- crates/milli/src/update/new/channel.rs | 107 +++++++++++++------------ 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 70c4a6042..26e375a5a 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -166,7 +166,6 @@ pub struct DbOperation { impl DbOperation { pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { - /// TODO replace the return type by an enum Write | Delete let skip = EntryHeader::variant_size() + mem::size_of::(); match self.key_length { Some(key_length) => { @@ -478,8 +477,7 @@ impl<'b> ExtractorBbqueueSender<'b> { fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); - self.write_key_value_with(database, key_length, value.len(), |buffer| { - let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); + self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { key_buffer.copy_from_slice(key); value_buffer.copy_from_slice(value); Ok(()) @@ -494,7 +492,7 @@ impl<'b> ExtractorBbqueueSender<'b> { key_value_writer: F, ) -> crate::Result<()> where - F: FnOnce(&mut [u8]) -> crate::Result<()>, + F: FnOnce(&mut [u8], &mut [u8]) -> crate::Result<()>, { let capacity = self.capacity; let refcell = self.producers.get().unwrap(); @@ -519,7 +517,8 @@ 
impl<'b> ExtractorBbqueueSender<'b> { let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); - key_value_writer(remaining)?; + let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); + key_value_writer(key_buffer, value_buffer)?; // We could commit only the used memory. grant.commit(total_length); @@ -635,12 +634,16 @@ impl WordDocidsSender<'_, '_, D> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); - self.sender.write_key_value_with(D::DATABASE, key_length, value_length, |buffer| { - let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); - key_buffer.copy_from_slice(key); - CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; - Ok(()) - }) + self.sender.write_key_value_with( + D::DATABASE, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; + Ok(()) + }, + ) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { @@ -667,25 +670,29 @@ impl FacetDocidsSender<'_, '_> { FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, }; - self.sender.write_key_value_with(database, key_length, value_length, |buffer| { - let (key_out, value_out) = buffer.split_at_mut(key.len()); - key_out.copy_from_slice(key); + self.sender.write_key_value_with( + database, + key_length, + value_length, + |key_out, value_out| { + key_out.copy_from_slice(key); - let value_out = match facet_kind { - // We must take the facet group size into account - // when we serialize strings and numbers. - FacetKind::String | FacetKind::Number => { - let (first, remaining) = value_out.split_first_mut().unwrap(); - *first = 1; - remaining - } - FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, - }; + let value_out = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. 
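The write_key_value_with change above moves the split_at_mut into the sender, so callers now receive the key and value buffers already separated. A simplified illustration of the new closure contract (the function name and error type here are placeholders, not the real API):

fn with_key_value_buffers<F>(frame: &mut [u8], key_len: usize, write: F) -> std::io::Result<()>
where
    F: FnOnce(&mut [u8], &mut [u8]) -> std::io::Result<()>,
{
    // Split the reserved frame once, on behalf of every caller.
    let (key_buffer, value_buffer) = frame.split_at_mut(key_len);
    write(key_buffer, value_buffer)
}

// Usage mirroring the word-docids sender above:
// with_key_value_buffers(&mut frame, key.len(), |key_buffer, value_buffer| {
//     key_buffer.copy_from_slice(key);
//     value_buffer.copy_from_slice(value);
//     Ok(())
// })?;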
+ FacetKind::String | FacetKind::Number => { + let (first, remaining) = value_out.split_first_mut().unwrap(); + *first = 1; + remaining + } + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, + }; - CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; - Ok(()) - }) + Ok(()) + }, + ) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { @@ -777,32 +784,30 @@ pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { - todo!("set rtree from file") - // self.0 - // .send(WriterOperation::DbOperation(DbOperation { - // database: Database::Main, - // entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( - // GEO_RTREE_KEY.as_bytes(), - // value, - // )), - // })) - // .map_err(|_| SendError(())) + self.0 + .sender + .send(ReceiverAction::LargeEntry { + database: Database::Main, + key: GEO_RTREE_KEY.to_string().into_bytes().into_boxed_slice(), + value, + }) + .map_err(|_| SendError(())) } - pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { - todo!("serialize directly into bbqueue (as a real roaringbitmap not a cbo)") + pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> { + let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let value_length = bitmap.serialized_size(); - // let mut buffer = Vec::new(); - // bitmap.serialize_into(&mut buffer).unwrap(); - - // self.0 - // .send(WriterOperation::DbOperation(DbOperation { - // database: Database::Main, - // entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( - // GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), - // &buffer, - // )), - // })) - // .map_err(|_| SendError(())) + self.0.write_key_value_with( + Database::Main, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + bitmap.serialize_into(value_buffer)?; + Ok(()) + }, + ) } } From 68c4717e215d12da34789d3e49e5d7223468fc4f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 28 Nov 2024 11:34:35 +0100 Subject: [PATCH 035/158] Change the settings tests and macros to avoid oversights --- .../src/routes/indexes/settings.rs | 514 ++++++++---------- .../tests/settings/get_settings.rs | 35 +- 2 files changed, 274 insertions(+), 275 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index e1794535b..e08047d83 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -17,6 +17,26 @@ use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; +macro_rules! 
make_setting_routes { + ($({$route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident}),*) => { + $( + make_setting_route!($route, $update_verb, $type, $err_ty, $attr, $camelcase_attr, $analytics); + )* + + pub fn configure(cfg: &mut web::ServiceConfig) { + use crate::extractors::sequential_extractor::SeqHandler; + cfg.service( + web::resource("") + .route(web::patch().to(SeqHandler(update_all))) + .route(web::get().to(SeqHandler(get_all))) + .route(web::delete().to(SeqHandler(delete_all)))) + $(.service($attr::resources()))*; + } + + pub const ALL_SETTINGS_NAMES: &[&str] = &[$(stringify!($attr)),*]; + }; +} + #[macro_export] macro_rules! make_setting_route { ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { @@ -153,279 +173,227 @@ macro_rules! make_setting_route { }; } -make_setting_route!( - "/filterable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, - >, - filterable_attributes, - "filterableAttributes", - FilterableAttributesAnalytics -); - -make_setting_route!( - "/sortable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, - >, - sortable_attributes, - "sortableAttributes", - SortableAttributesAnalytics -); - -make_setting_route!( - "/displayed-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, - >, - displayed_attributes, - "displayedAttributes", - DisplayedAttributesAnalytics -); - -make_setting_route!( - "/typo-tolerance", - patch, - meilisearch_types::settings::TypoSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, - >, - typo_tolerance, - "typoTolerance", - TypoToleranceAnalytics -); - -make_setting_route!( - "/searchable-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, - >, - searchable_attributes, - "searchableAttributes", - SearchableAttributesAnalytics -); - -make_setting_route!( - "/stop-words", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, - >, - stop_words, - "stopWords", - StopWordsAnalytics -); - -make_setting_route!( - "/non-separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, - >, - non_separator_tokens, - "nonSeparatorTokens", - NonSeparatorTokensAnalytics -); - -make_setting_route!( - "/separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, - >, - separator_tokens, - "separatorTokens", - SeparatorTokensAnalytics -); - -make_setting_route!( - "/dictionary", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, - >, - dictionary, - "dictionary", - DictionaryAnalytics -); - -make_setting_route!( - "/synonyms", - put, - std::collections::BTreeMap>, - 
meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, - >, - synonyms, - "synonyms", - SynonymsAnalytics -); - -make_setting_route!( - "/distinct-attribute", - put, - String, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, - >, - distinct_attribute, - "distinctAttribute", - DistinctAttributeAnalytics -); - -make_setting_route!( - "/proximity-precision", - put, - meilisearch_types::settings::ProximityPrecisionView, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, - >, - proximity_precision, - "proximityPrecision", - ProximityPrecisionAnalytics -); - -make_setting_route!( - "/localized-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, - >, - localized_attributes, - "localizedAttributes", - LocalesAnalytics -); - -make_setting_route!( - "/ranking-rules", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, - >, - ranking_rules, - "rankingRules", - RankingRulesAnalytics -); - -make_setting_route!( - "/faceting", - patch, - meilisearch_types::settings::FacetingSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, - >, - faceting, - "faceting", - FacetingAnalytics -); - -make_setting_route!( - "/pagination", - patch, - meilisearch_types::settings::PaginationSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsPagination, - >, - pagination, - "pagination", - PaginationAnalytics -); - -make_setting_route!( - "/embedders", - patch, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, - >, - embedders, - "embedders", - EmbeddersAnalytics -); - -make_setting_route!( - "/search-cutoff-ms", - put, - u64, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, - >, - search_cutoff_ms, - "searchCutoffMs", - SearchCutoffMsAnalytics -); - -make_setting_route!( - "/facet-search", - put, - bool, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFacetSearch, - >, - facet_search, - "facetSearch", - FacetSearchAnalytics -); - -make_setting_route!( - "/prefix-search", - put, - meilisearch_types::settings::PrefixSearchSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsPrefixSearch, - >, - prefix_search, - "prefixSearch", - PrefixSearchAnalytics -); - -macro_rules! 
generate_configure { - ($($mod:ident),*) => { - pub fn configure(cfg: &mut web::ServiceConfig) { - use crate::extractors::sequential_extractor::SeqHandler; - cfg.service( - web::resource("") - .route(web::patch().to(SeqHandler(update_all))) - .route(web::get().to(SeqHandler(get_all))) - .route(web::delete().to(SeqHandler(delete_all)))) - $(.service($mod::resources()))*; - } - }; -} - -generate_configure!( - filterable_attributes, - sortable_attributes, - displayed_attributes, - localized_attributes, - searchable_attributes, - distinct_attribute, - proximity_precision, - stop_words, - separator_tokens, - non_separator_tokens, - dictionary, - synonyms, - ranking_rules, - typo_tolerance, - pagination, - faceting, - embedders, - search_cutoff_ms +make_setting_routes!( + { + "/filterable-attributes", + put, + std::collections::BTreeSet, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, + >, + filterable_attributes, + "filterableAttributes", + FilterableAttributesAnalytics + }, + { + "/sortable-attributes", + put, + std::collections::BTreeSet, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, + >, + sortable_attributes, + "sortableAttributes", + SortableAttributesAnalytics + }, + { + "/displayed-attributes", + put, + Vec, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, + >, + displayed_attributes, + "displayedAttributes", + DisplayedAttributesAnalytics + }, + { + "/typo-tolerance", + patch, + meilisearch_types::settings::TypoSettings, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, + >, + typo_tolerance, + "typoTolerance", + TypoToleranceAnalytics + }, + { + "/searchable-attributes", + put, + Vec, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, + >, + searchable_attributes, + "searchableAttributes", + SearchableAttributesAnalytics + }, + { + "/stop-words", + put, + std::collections::BTreeSet, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, + >, + stop_words, + "stopWords", + StopWordsAnalytics + }, + { + "/non-separator-tokens", + put, + std::collections::BTreeSet, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, + >, + non_separator_tokens, + "nonSeparatorTokens", + NonSeparatorTokensAnalytics + }, + { + "/separator-tokens", + put, + std::collections::BTreeSet, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, + >, + separator_tokens, + "separatorTokens", + SeparatorTokensAnalytics + }, + { + "/dictionary", + put, + std::collections::BTreeSet, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, + >, + dictionary, + "dictionary", + DictionaryAnalytics + }, + { + "/synonyms", + put, + std::collections::BTreeMap>, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, + >, + synonyms, + "synonyms", + SynonymsAnalytics + }, + { + "/distinct-attribute", + put, + String, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, + >, + 
distinct_attribute, + "distinctAttribute", + DistinctAttributeAnalytics + }, + { + "/proximity-precision", + put, + meilisearch_types::settings::ProximityPrecisionView, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, + >, + proximity_precision, + "proximityPrecision", + ProximityPrecisionAnalytics + }, + { + "/localized-attributes", + put, + Vec, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, + >, + localized_attributes, + "localizedAttributes", + LocalesAnalytics + }, + { + "/ranking-rules", + put, + Vec, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, + >, + ranking_rules, + "rankingRules", + RankingRulesAnalytics + }, + { + "/faceting", + patch, + meilisearch_types::settings::FacetingSettings, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, + >, + faceting, + "faceting", + FacetingAnalytics + }, + { + "/pagination", + patch, + meilisearch_types::settings::PaginationSettings, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPagination, + >, + pagination, + "pagination", + PaginationAnalytics + }, + { + "/embedders", + patch, + std::collections::BTreeMap>, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, + >, + embedders, + "embedders", + EmbeddersAnalytics + }, + { + "/search-cutoff-ms", + put, + u64, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, + >, + search_cutoff_ms, + "searchCutoffMs", + SearchCutoffMsAnalytics + }, + { + "/facet-search", + put, + bool, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFacetSearch, + >, + facet_search, + "facetSearch", + FacetSearchAnalytics + }, + { + "/prefix-search", + put, + meilisearch_types::settings::PrefixSearchSettings, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPrefixSearch, + >, + prefix_search, + "prefixSearch", + PrefixSearchAnalytics + } ); pub async fn update_all( diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 1b1964680..bb1aa861d 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -37,6 +37,23 @@ static DEFAULT_SETTINGS_VALUES: Lazy> = Lazy::new(| }), ); map.insert("search_cutoff_ms", json!(null)); + map.insert("embedders", json!(null)); + map.insert("facet_search", json!(true)); + map.insert("prefix_search", json!("indexingTime")); + map.insert("proximity_precision", json!("byWord")); + map.insert("sortable_attributes", json!([])); + map.insert( + "typo_tolerance", + json!({ + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [] + }), + ); map }); @@ -343,7 +360,7 @@ async fn error_update_setting_unexisting_index_invalid_uid() { } macro_rules! test_setting_routes { - ($($setting:ident $write_method:ident), *) => { + ($($setting:ident $write_method:ident,) *) => { $( mod $setting { use crate::common::Server; @@ -409,6 +426,14 @@ macro_rules! 
test_setting_routes { } } )* + + #[actix_rt::test] + async fn all_setting_tested() { + let expected = std::collections::BTreeSet::from_iter(meilisearch::routes::indexes::settings::ALL_SETTINGS_NAMES.iter()); + let tested = std::collections::BTreeSet::from_iter([$(stringify!($setting)),*].iter()); + let diff: Vec<_> = expected.difference(&tested).collect(); + assert!(diff.is_empty(), "Not all settings were tested, please add the following settings to the `test_setting_routes!` macro: {:?}", diff); + } }; } @@ -426,7 +451,13 @@ test_setting_routes!( synonyms put, pagination patch, faceting patch, - search_cutoff_ms put + search_cutoff_ms put, + embedders patch, + facet_search put, + prefix_search put, + proximity_precision put, + sortable_attributes put, + typo_tolerance patch, ); #[actix_rt::test] From 9f36ffcbdb2e09799987f9da93660b4ab27d2bcb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 28 Nov 2024 11:44:09 +0100 Subject: [PATCH 036/158] Polish make_setting_routes! --- .../src/routes/indexes/settings.rs | 284 +++++++++--------- 1 file changed, 142 insertions(+), 142 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index e08047d83..bb24fc880 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -18,7 +18,7 @@ use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; macro_rules! make_setting_routes { - ($({$route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident}),*) => { + ($({route: $route:literal, update_verb: $update_verb:ident, value_type: $type:ty, err_type: $err_ty:ty, attr: $attr:ident, camelcase_attr: $camelcase_attr:literal, analytics: $analytics:ident},)*) => { $( make_setting_route!($route, $update_verb, $type, $err_ty, $attr, $camelcase_attr, $analytics); )* @@ -175,225 +175,225 @@ macro_rules! 
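The all_setting_tested check above only works because the routes macro also emits the list of setting names it expanded. A minimal sketch of that pattern, with simplified names and none of the actix plumbing (the settings! macro and the hard-coded tested list below are illustrative only):

macro_rules! settings {
    ($($attr:ident),* $(,)?) => {
        // One module per setting; the real macro also generates the routes.
        $(pub mod $attr {})*
        // The same identifiers, exported so tests can diff them against the tested set.
        pub const ALL_SETTINGS_NAMES: &[&str] = &[$(stringify!($attr)),*];
    };
}

settings!(filterable_attributes, stop_words, synonyms);

#[cfg(test)]
mod tests {
    use std::collections::BTreeSet;

    #[test]
    fn every_setting_has_a_test() {
        let expected: BTreeSet<_> = super::ALL_SETTINGS_NAMES.iter().collect();
        // In the real suite this list comes from the test_setting_routes! invocation;
        // dropping an entry here makes the assertion fail, which is the oversight this guards against.
        let tested: BTreeSet<_> = ["filterable_attributes", "stop_words", "synonyms"].iter().collect();
        let missing: Vec<_> = expected.difference(&tested).collect();
        assert!(missing.is_empty(), "untested settings: {missing:?}");
    }
}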
make_setting_route { make_setting_routes!( { - "/filterable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< + route: "/filterable-attributes", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, >, - filterable_attributes, - "filterableAttributes", - FilterableAttributesAnalytics + attr: filterable_attributes, + camelcase_attr: "filterableAttributes", + analytics: FilterableAttributesAnalytics }, { - "/sortable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< + route: "/sortable-attributes", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, >, - sortable_attributes, - "sortableAttributes", - SortableAttributesAnalytics + attr: sortable_attributes, + camelcase_attr: "sortableAttributes", + analytics: SortableAttributesAnalytics }, { - "/displayed-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< + route: "/displayed-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, >, - displayed_attributes, - "displayedAttributes", - DisplayedAttributesAnalytics + attr: displayed_attributes, + camelcase_attr: "displayedAttributes", + analytics: DisplayedAttributesAnalytics }, { - "/typo-tolerance", - patch, - meilisearch_types::settings::TypoSettings, - meilisearch_types::deserr::DeserrJsonError< + route: "/typo-tolerance", + update_verb: patch, + value_type: meilisearch_types::settings::TypoSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, >, - typo_tolerance, - "typoTolerance", - TypoToleranceAnalytics + attr: typo_tolerance, + camelcase_attr: "typoTolerance", + analytics: TypoToleranceAnalytics }, { - "/searchable-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< + route: "/searchable-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, >, - searchable_attributes, - "searchableAttributes", - SearchableAttributesAnalytics + attr: searchable_attributes, + camelcase_attr: "searchableAttributes", + analytics: SearchableAttributesAnalytics }, { - "/stop-words", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< + route: "/stop-words", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, >, - stop_words, - "stopWords", - StopWordsAnalytics + attr: stop_words, + camelcase_attr: "stopWords", + analytics: StopWordsAnalytics }, { - "/non-separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< + route: "/non-separator-tokens", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, >, - non_separator_tokens, - "nonSeparatorTokens", - NonSeparatorTokensAnalytics + attr: non_separator_tokens, + camelcase_attr: 
"nonSeparatorTokens", + analytics: NonSeparatorTokensAnalytics }, { - "/separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< + route: "/separator-tokens", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, >, - separator_tokens, - "separatorTokens", - SeparatorTokensAnalytics + attr: separator_tokens, + camelcase_attr: "separatorTokens", + analytics: SeparatorTokensAnalytics }, { - "/dictionary", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< + route: "/dictionary", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, >, - dictionary, - "dictionary", - DictionaryAnalytics + attr: dictionary, + camelcase_attr: "dictionary", + analytics: DictionaryAnalytics }, { - "/synonyms", - put, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< + route: "/synonyms", + update_verb: put, + value_type: std::collections::BTreeMap>, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, >, - synonyms, - "synonyms", - SynonymsAnalytics + attr: synonyms, + camelcase_attr: "synonyms", + analytics: SynonymsAnalytics }, { - "/distinct-attribute", - put, - String, - meilisearch_types::deserr::DeserrJsonError< + route: "/distinct-attribute", + update_verb: put, + value_type: String, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, >, - distinct_attribute, - "distinctAttribute", - DistinctAttributeAnalytics + attr: distinct_attribute, + camelcase_attr: "distinctAttribute", + analytics: DistinctAttributeAnalytics }, { - "/proximity-precision", - put, - meilisearch_types::settings::ProximityPrecisionView, - meilisearch_types::deserr::DeserrJsonError< + route: "/proximity-precision", + update_verb: put, + value_type: meilisearch_types::settings::ProximityPrecisionView, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, >, - proximity_precision, - "proximityPrecision", - ProximityPrecisionAnalytics + attr: proximity_precision, + camelcase_attr: "proximityPrecision", + analytics: ProximityPrecisionAnalytics }, { - "/localized-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< + route: "/localized-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, >, - localized_attributes, - "localizedAttributes", - LocalesAnalytics + attr: localized_attributes, + camelcase_attr: "localizedAttributes", + analytics: LocalesAnalytics }, { - "/ranking-rules", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< + route: "/ranking-rules", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, >, - ranking_rules, - "rankingRules", - RankingRulesAnalytics + attr: ranking_rules, + camelcase_attr: "rankingRules", + analytics: RankingRulesAnalytics }, { - "/faceting", - patch, - meilisearch_types::settings::FacetingSettings, - meilisearch_types::deserr::DeserrJsonError< + route: 
"/faceting", + update_verb: patch, + value_type: meilisearch_types::settings::FacetingSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, >, - faceting, - "faceting", - FacetingAnalytics + attr: faceting, + camelcase_attr: "faceting", + analytics: FacetingAnalytics }, { - "/pagination", - patch, - meilisearch_types::settings::PaginationSettings, - meilisearch_types::deserr::DeserrJsonError< + route: "/pagination", + update_verb: patch, + value_type: meilisearch_types::settings::PaginationSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsPagination, >, - pagination, - "pagination", - PaginationAnalytics + attr: pagination, + camelcase_attr: "pagination", + analytics: PaginationAnalytics }, { - "/embedders", - patch, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< + route: "/embedders", + update_verb: patch, + value_type: std::collections::BTreeMap>, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, >, - embedders, - "embedders", - EmbeddersAnalytics + attr: embedders, + camelcase_attr: "embedders", + analytics: EmbeddersAnalytics }, { - "/search-cutoff-ms", - put, - u64, - meilisearch_types::deserr::DeserrJsonError< + route: "/search-cutoff-ms", + update_verb: put, + value_type: u64, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, >, - search_cutoff_ms, - "searchCutoffMs", - SearchCutoffMsAnalytics + attr: search_cutoff_ms, + camelcase_attr: "searchCutoffMs", + analytics: SearchCutoffMsAnalytics }, { - "/facet-search", - put, - bool, - meilisearch_types::deserr::DeserrJsonError< + route: "/facet-search", + update_verb: put, + value_type: bool, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsFacetSearch, >, - facet_search, - "facetSearch", - FacetSearchAnalytics + attr: facet_search, + camelcase_attr: "facetSearch", + analytics: FacetSearchAnalytics }, { - "/prefix-search", - put, - meilisearch_types::settings::PrefixSearchSettings, - meilisearch_types::deserr::DeserrJsonError< + route: "/prefix-search", + update_verb: put, + value_type: meilisearch_types::settings::PrefixSearchSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsPrefixSearch, >, - prefix_search, - "prefixSearch", - PrefixSearchAnalytics - } + attr: prefix_search, + camelcase_attr: "prefixSearch", + analytics: PrefixSearchAnalytics + }, ); pub async fn update_all( From 58eab9a0182323ba4ce458d026726e7253a51917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 18:06:43 +0100 Subject: [PATCH 037/158] Send large payload through crossbeam --- crates/milli/src/update/new/channel.rs | 263 ++++++++++++++++++--- crates/milli/src/update/new/indexer/mod.rs | 39 ++- 2 files changed, 266 insertions(+), 36 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 26e375a5a..7eaa50df1 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,4 +1,5 @@ use std::cell::RefCell; +use std::io::{self, BufWriter}; use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; @@ -9,7 +10,7 @@ use bytemuck::{checked, CheckedBitPattern, NoUninit}; use 
crossbeam_channel::SendError; use heed::types::Bytes; use heed::BytesDecode; -use memmap2::Mmap; +use memmap2::{Mmap, MmapMut}; use roaring::RoaringBitmap; use super::extract::FacetKind; @@ -98,20 +99,63 @@ pub struct WriterBbqueueReceiver<'a> { pub enum ReceiverAction { /// Wake up, you have frames to read for the BBQueue buffers. WakeUp, - /// An entry that cannot fit in the BBQueue buffers has been - /// written to disk, memory-mapped and must be written in the - /// database. - LargeEntry { - /// The database where the entry must be written. - database: Database, - /// The key of the entry that must be written in the database. - key: Box<[u8]>, - /// The large value that must be written. - /// - /// Note: We can probably use a `File` here and - /// use `Database::put_reserved` instead of memory-mapping. - value: Mmap, - }, + LargeEntry(LargeEntry), + LargeVector(LargeVector), + LargeVectors(LargeVectors), +} + +/// An entry that cannot fit in the BBQueue buffers has been +/// written to disk, memory-mapped and must be written in the +/// database. +#[derive(Debug)] +pub struct LargeEntry { + /// The database where the entry must be written. + pub database: Database, + /// The key of the entry that must be written in the database. + pub key: Box<[u8]>, + /// The large value that must be written. + /// + /// Note: We can probably use a `File` here and + /// use `Database::put_reserved` instead of memory-mapping. + pub value: Mmap, +} + +/// When an embedding is larger than the available +/// BBQueue space it arrives here. +#[derive(Debug)] +pub struct LargeVector { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The large embedding that must be written. + pub embedding: Mmap, +} + +impl LargeVector { + pub fn read_embedding(&self) -> &[f32] { + bytemuck::cast_slice(&self.embedding) + } +} + +/// When embeddings are larger than the available +/// BBQueue space it arrives here. +#[derive(Debug)] +pub struct LargeVectors { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The dimensions of the embeddings in this payload. + pub dimensions: u16, + /// The large embedding that must be written. + pub embeddings: Mmap, +} + +impl LargeVectors { + pub fn read_embeddings(&self) -> impl Iterator { + self.embeddings.chunks_exact(self.dimensions as usize).map(bytemuck::cast_slice) + } } impl<'a> WriterBbqueueReceiver<'a> { @@ -209,12 +253,55 @@ impl ArroySetVector { } } +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVectors { + pub docid: DocumentId, + pub dimensions: u16, + pub embedder_id: u8, + _padding: u8, +} + +impl ArroySetVectors { + fn remaining_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + // /// The number of embeddings in this payload. + // pub fn embedding_count(&self, frame: &FrameGrantR<'_>) -> usize { + // let bytes = Self::remaining_bytes(frame); + // bytes.len().checked_div(self.dimensions as usize).unwrap() + // } + + /// Read the embedding at `index` or `None` if out of bounds. 
+ pub fn read_embedding_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + index: usize, + vec: &'v mut Vec, + ) -> Option<&'v [f32]> { + vec.clear(); + let bytes = Self::remaining_bytes(frame); + let embedding_size = self.dimensions as usize * mem::size_of::(); + let embedding_bytes = bytes.chunks_exact(embedding_size).nth(index)?; + embedding_bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { + let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); + vec.push(f); + }); + Some(&vec[..]) + } +} + #[derive(Debug, Clone, Copy)] #[repr(u8)] pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), ArroySetVector(ArroySetVector), + ArroySetVectors(ArroySetVectors), } impl EntryHeader { @@ -227,6 +314,7 @@ impl EntryHeader { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, EntryHeader::ArroySetVector(_) => 2, + EntryHeader::ArroySetVectors(_) => 3, } } @@ -245,11 +333,15 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() } - /// The `embedding_length` corresponds to the number of `f32` in the embedding. - fn total_set_vector_size(embedding_length: usize) -> usize { - Self::variant_size() - + mem::size_of::() - + embedding_length * mem::size_of::() + /// The `dimensions` corresponds to the number of `f32` in the embedding. + fn total_set_vector_size(dimensions: usize) -> usize { + Self::variant_size() + mem::size_of::() + dimensions * mem::size_of::() + } + + /// The `dimensions` corresponds to the number of `f32` in the embedding. + fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size * count } fn header_size(&self) -> usize { @@ -257,6 +349,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), + EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), }; Self::variant_size() + payload_size } @@ -279,6 +372,11 @@ impl EntryHeader { let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVector(header) } + 3 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVectors(header) + } id => panic!("invalid variant id: {id}"), } } @@ -289,6 +387,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), + EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), }; *first = self.variant_id(); remaining.copy_from_slice(payload_bytes); @@ -405,7 +504,7 @@ impl<'b> ExtractorBbqueueSender<'b> { let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); let total_length = EntryHeader::total_delete_vector_size(); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + panic!("The entry is larger ({total_length} bytes) than the BBQueue capacity ({capacity} bytes)"); } // Spin loop to have a frame the size we requested. 
@@ -441,11 +540,21 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let payload_header = - EntryHeader::ArroySetVector(ArroySetVector { docid, embedder_id, _padding: [0; 3] }); + let arroy_set_vector = ArroySetVector { docid, embedder_id, _padding: [0; 3] }; + let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); let total_length = EntryHeader::total_set_vector_size(embedding.len()); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + let mut embedding_bytes = bytemuck::cast_slice(embedding); + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + io::copy(&mut embedding_bytes, &mut value_file)?; + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + value_file.sync_all()?; + let embedding = unsafe { Mmap::map(&value_file)? }; + + let large_vector = LargeVector { docid, embedder_id, embedding }; + self.sender.send(ReceiverAction::LargeVector(large_vector)).unwrap(); + + return Ok(()); } // Spin loop to have a frame the size we requested. @@ -457,7 +566,6 @@ impl<'b> ExtractorBbqueueSender<'b> { } }; - // payload_header.serialize_into(&mut grant); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); @@ -475,6 +583,83 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } + fn set_vectors( + &self, + docid: u32, + embedder_id: u8, + embeddings: &[Vec], + ) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let dimensions = match embeddings.first() { + Some(embedding) => embedding.len(), + None => return Ok(()), + }; + + let arroy_set_vector = ArroySetVectors { + docid, + dimensions: dimensions.try_into().unwrap(), + embedder_id, + _padding: 0, + }; + + let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); + let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); + if total_length > capacity { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + for embedding in embeddings { + let mut embedding_bytes = bytemuck::cast_slice(embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + } + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + value_file.sync_all()?; + let embeddings = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVectors { + docid, + embedder_id, + dimensions: dimensions.try_into().unwrap(), + embeddings, + }; + + self.sender.send(ReceiverAction::LargeVectors(large_vectors)).unwrap(); + + return Ok(()); + } + + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + }; + + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + + // We could commit only the used memory. 
+ grant.commit(total_length); + + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + + Ok(()) + } + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { @@ -502,7 +687,22 @@ impl<'b> ExtractorBbqueueSender<'b> { let payload_header = EntryHeader::DbOperation(operation); let total_length = EntryHeader::total_key_value_size(key_length, value_length); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + let mut key_buffer = vec![0; key_length.get() as usize].into_boxed_slice(); + let value_file = tempfile::tempfile()?; + value_file.set_len(value_length.try_into().unwrap())?; + let mut mmap_mut = unsafe { MmapMut::map_mut(&value_file)? }; + + key_value_writer(&mut key_buffer, &mut mmap_mut)?; + + self.sender + .send(ReceiverAction::LargeEntry(LargeEntry { + database, + key: key_buffer, + value: mmap_mut.make_read_only()?, + })) + .unwrap(); + + return Ok(()); } // Spin loop to have a frame the size we requested. @@ -559,7 +759,7 @@ impl<'b> ExtractorBbqueueSender<'b> { let payload_header = EntryHeader::DbOperation(operation); let total_length = EntryHeader::total_key_size(key_length); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + panic!("The entry is larger ({total_length} bytes) than the BBQueue capacity ({capacity} bytes)"); } // Spin loop to have a frame the size we requested. @@ -763,10 +963,7 @@ impl EmbeddingSender<'_, '_> { embedder_id: u8, embeddings: Vec, ) -> crate::Result<()> { - for embedding in embeddings { - self.set_vector(docid, embedder_id, embedding)?; - } - Ok(()) + self.0.set_vectors(docid, embedder_id, &embeddings[..]) } pub fn set_vector( @@ -786,11 +983,11 @@ impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { self.0 .sender - .send(ReceiverAction::LargeEntry { + .send(ReceiverAction::LargeEntry(LargeEntry { database: Database::Main, key: GEO_RTREE_KEY.to_string().into_bytes().into_boxed_slice(), value, - }) + })) .map_err(|_| SendError(())) } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 3a4406aef..9ad7a8f0b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -16,6 +16,7 @@ use rand::SeedableRng as _; use raw_collections::RawMap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; +use {LargeEntry, LargeVector}; use super::channel::*; use super::extract::*; @@ -40,7 +41,7 @@ use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::vector::{ArroyWrapper, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; use crate::{ Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder, UserError, @@ -132,7 +133,8 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); let _entered = 
span.enter(); - extract(document_changes, + extract( + document_changes, &document_extractor, indexing_context, &mut extractor_allocs, @@ -416,7 +418,7 @@ where match action { ReceiverAction::WakeUp => (), - ReceiverAction::LargeEntry { database, key, value } => { + ReceiverAction::LargeEntry(LargeEntry { database, key, value }) => { let database_name = database.database_name(); let database = database.database(index); if let Err(error) = database.put(wtxn, &key, &value) { @@ -428,6 +430,24 @@ where })); } } + ReceiverAction::LargeVector(large_vector) => { + let embedding = large_vector.read_embedding(); + let LargeVector { docid, embedder_id, .. } = large_vector; + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, embedding)?; + } + ReceiverAction::LargeVectors(large_vectors) => { + let LargeVectors { docid, embedder_id, .. } = large_vectors; + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + let mut embeddings = Embeddings::new(*dimensions); + for embedding in large_vectors.read_embeddings() { + embeddings.push(embedding.to_vec()).unwrap(); + } + } } // Every time the is a message in the channel we search @@ -582,6 +602,19 @@ fn write_from_bbqueue( writer.del_items(wtxn, *dimensions, docid)?; writer.add_item(wtxn, docid, embedding)?; } + EntryHeader::ArroySetVectors(asvs) => { + let ArroySetVectors { docid, embedder_id, .. } = asvs; + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + for index in 0.. { + match asvs.read_embedding_into_vec(frame, index, aligned_embedding) { + Some(embedding) => writer.add_item(wtxn, docid, embedding)?, + None => break, + } + } + } } } From 5383f41bba83f522a43c993e6c6261042d430232 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 28 Nov 2024 11:55:38 +0100 Subject: [PATCH 038/158] Polish test_setting_routes! 
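
The test macro entries become labelled and brace-delimited, in the same style as
make_setting_routes! above. As a rough, hypothetical sketch (this is not the
macro touched by this patch, only an illustration of the pattern), entries of
the form `{ setting: ..., update_verb: ..., default_value: ... }` can be
matched by a declarative macro like this:

    // Minimal, self-contained sketch of matching labelled, brace-delimited
    // entries in a declarative macro. `describe_settings!` is a hypothetical
    // helper used only for illustration.
    macro_rules! describe_settings {
        ($({ setting: $setting:ident, update_verb: $update_verb:ident, default_value: $default_value:tt },)*) => {
            /// Returns one human-readable line per declared setting.
            pub fn describe() -> Vec<String> {
                vec![
                    $(format!(
                        "{}: updated with {}, defaults to {}",
                        stringify!($setting),
                        stringify!($update_verb),
                        stringify!($default_value),
                    )),*
                ]
            }
        };
    }

    describe_settings!(
        { setting: stop_words, update_verb: put, default_value: [] },
        { setting: pagination, update_verb: patch, default_value: {"maxTotalHits": 1000} },
    );

    fn main() {
        for line in describe() {
            println!("{line}");
        }
    }

Naming the fields in the matcher keeps long invocations readable and lets an
entry gain new fields (such as default_value) without relying on positional
order.
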
--- .../tests/settings/get_settings.rs | 187 ++++++++++-------- 1 file changed, 105 insertions(+), 82 deletions(-) diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index bb1aa861d..b9e10033a 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -1,62 +1,6 @@ -use std::collections::HashMap; - -use once_cell::sync::Lazy; - -use crate::common::{Server, Value}; +use crate::common::Server; use crate::json; -static DEFAULT_SETTINGS_VALUES: Lazy> = Lazy::new(|| { - let mut map = HashMap::new(); - map.insert("displayed_attributes", json!(["*"])); - map.insert("searchable_attributes", json!(["*"])); - map.insert("localized_attributes", json!(null)); - map.insert("filterable_attributes", json!([])); - map.insert("distinct_attribute", json!(null)); - map.insert( - "ranking_rules", - json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]), - ); - map.insert("stop_words", json!([])); - map.insert("non_separator_tokens", json!([])); - map.insert("separator_tokens", json!([])); - map.insert("dictionary", json!([])); - map.insert("synonyms", json!({})); - map.insert( - "faceting", - json!({ - "maxValuesPerFacet": json!(100), - "sortFacetValuesBy": { - "*": "alpha" - } - }), - ); - map.insert( - "pagination", - json!({ - "maxTotalHits": json!(1000), - }), - ); - map.insert("search_cutoff_ms", json!(null)); - map.insert("embedders", json!(null)); - map.insert("facet_search", json!(true)); - map.insert("prefix_search", json!("indexingTime")); - map.insert("proximity_precision", json!("byWord")); - map.insert("sortable_attributes", json!([])); - map.insert( - "typo_tolerance", - json!({ - "enabled": true, - "minWordSizeForTypos": { - "oneTypo": 5, - "twoTypos": 9 - }, - "disableOnWords": [], - "disableOnAttributes": [] - }), - ); - map -}); - #[actix_rt::test] async fn get_settings_unexisting_index() { let server = Server::new().await; @@ -360,11 +304,10 @@ async fn error_update_setting_unexisting_index_invalid_uid() { } macro_rules! test_setting_routes { - ($($setting:ident $write_method:ident,) *) => { + ($({setting: $setting:ident, update_verb: $update_verb:ident, default_value: $default_value:tt},) *) => { $( mod $setting { use crate::common::Server; - use super::DEFAULT_SETTINGS_VALUES; #[actix_rt::test] async fn get_unexisting_index() { @@ -386,7 +329,7 @@ macro_rules! test_setting_routes { .chars() .map(|c| if c == '_' { '-' } else { c }) .collect::()); - let (response, code) = server.service.$write_method(url, serde_json::Value::Null.into()).await; + let (response, code) = server.service.$update_verb(url, serde_json::Value::Null.into()).await; assert_eq!(code, 202, "{}", response); server.index("").wait_task(0).await; let (response, code) = server.index("test").get().await; @@ -421,8 +364,8 @@ macro_rules! test_setting_routes { .collect::()); let (response, code) = server.service.get(url).await; assert_eq!(code, 200, "{}", response); - let expected = DEFAULT_SETTINGS_VALUES.get(stringify!($setting)).unwrap(); - assert_eq!(expected, &response); + let expected = crate::json!($default_value); + assert_eq!(expected, response); } } )* @@ -438,26 +381,106 @@ macro_rules! 
test_setting_routes { } test_setting_routes!( - filterable_attributes put, - displayed_attributes put, - localized_attributes put, - searchable_attributes put, - distinct_attribute put, - stop_words put, - separator_tokens put, - non_separator_tokens put, - dictionary put, - ranking_rules put, - synonyms put, - pagination patch, - faceting patch, - search_cutoff_ms put, - embedders patch, - facet_search put, - prefix_search put, - proximity_precision put, - sortable_attributes put, - typo_tolerance patch, + { + setting: filterable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: displayed_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: localized_attributes, + update_verb: put, + default_value: null + }, + { + setting: searchable_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: distinct_attribute, + update_verb: put, + default_value: null + }, + { + setting: stop_words, + update_verb: put, + default_value: [] + }, + { + setting: separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: non_separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: dictionary, + update_verb: put, + default_value: [] + }, + { + setting: ranking_rules, + update_verb: put, + default_value: ["words", "typo", "proximity", "attribute", "sort", "exactness"] + }, + { + setting: synonyms, + update_verb: put, + default_value: {} + }, + { + setting: pagination, + update_verb: patch, + default_value: {"maxTotalHits": 1000} + }, + { + setting: faceting, + update_verb: patch, + default_value: {"maxValuesPerFacet": 100, "sortFacetValuesBy": {"*": "alpha"}} + }, + { + setting: search_cutoff_ms, + update_verb: put, + default_value: null + }, + { + setting: embedders, + update_verb: patch, + default_value: null + }, + { + setting: facet_search, + update_verb: put, + default_value: true + }, + { + setting: prefix_search, + update_verb: put, + default_value: "indexingTime" + }, + { + setting: proximity_precision, + update_verb: put, + default_value: "byWord" + }, + { + setting: sortable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: typo_tolerance, + update_verb: patch, + default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": []} + }, ); #[actix_rt::test] From cc4bd54669b64b6fa195616fb18ca7da38c299a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 13:53:25 +0100 Subject: [PATCH 039/158] Correctly construct the Embeddings struct --- crates/milli/src/update/new/channel.rs | 14 ++++++++++++++ crates/milli/src/update/new/indexer/mod.rs | 13 ++++++------- crates/milli/src/vector/mod.rs | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7eaa50df1..237c19a5c 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -293,6 +293,20 @@ impl ArroySetVectors { }); Some(&vec[..]) } + + /// Read all the embeddings and write them into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + vec.clear(); + Self::remaining_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { + let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); + vec.push(f); + }); + &vec[..] 
+ } } #[derive(Debug, Clone, Copy)] diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 9ad7a8f0b..a8a94cb7c 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -442,11 +442,12 @@ where let LargeVectors { docid, embedder_id, .. } = large_vectors; let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; let mut embeddings = Embeddings::new(*dimensions); for embedding in large_vectors.read_embeddings() { embeddings.push(embedding.to_vec()).unwrap(); } + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; } } @@ -607,13 +608,11 @@ fn write_from_bbqueue( let frame = frame_with_header.frame(); let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let mut embeddings = Embeddings::new(*dimensions); + let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); + embeddings.append(all_embeddings.to_vec()).unwrap(); writer.del_items(wtxn, *dimensions, docid)?; - for index in 0.. { - match asvs.read_embedding_into_vec(frame, index, aligned_embedding) { - Some(embedding) => writer.add_item(wtxn, docid, embedding)?, - None => break, - } - } + writer.add_items(wtxn, docid, &embeddings)?; } } } diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3047e6dfc..a1d71ef93 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -475,7 +475,7 @@ impl Embeddings { Ok(()) } - /// Append a flat vector of embeddings a the end of the embeddings. + /// Append a flat vector of embeddings at the end of the embeddings. /// /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. 
pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { From 3dc87f5baacc649483b30d76aab251a3b8ebed30 Mon Sep 17 00:00:00 2001 From: curquiza Date: Thu, 28 Nov 2024 14:33:05 +0100 Subject: [PATCH 040/158] Update mini-dashboard to v0.2.16 version --- crates/meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 2884f0c9c..4f357157e 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -157,5 +157,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip" -sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.16/build.zip" +sha1 = "68f83438a114aabbe76bc9fe480071e741996662" From 096a28656ee3c1bba1900f2335e33a8a88677070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 15:15:06 +0100 Subject: [PATCH 041/158] Fix a bug around deleting all the vectors of a doc --- crates/milli/src/update/new/channel.rs | 68 ++++++--------------- crates/milli/src/update/new/indexer/mod.rs | 7 ++- crates/milli/src/update/new/ref_cell_ext.rs | 1 + 3 files changed, 23 insertions(+), 53 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 237c19a5c..38f436837 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -146,15 +146,13 @@ pub struct LargeVectors { pub docid: DocumentId, /// The embedder id in which to insert the large embedding. pub embedder_id: u8, - /// The dimensions of the embeddings in this payload. - pub dimensions: u16, /// The large embedding that must be written. pub embeddings: Mmap, } impl LargeVectors { - pub fn read_embeddings(&self) -> impl Iterator { - self.embeddings.chunks_exact(self.dimensions as usize).map(bytemuck::cast_slice) + pub fn read_embeddings(&self, dimensions: usize) -> impl Iterator { + self.embeddings.chunks_exact(dimensions).map(bytemuck::cast_slice) } } @@ -241,15 +239,18 @@ impl ArroySetVector { &self, frame: &FrameGrantR<'_>, vec: &'v mut Vec, - ) -> &'v [f32] { + ) -> Option<&'v [f32]> { vec.clear(); let skip = EntryHeader::variant_size() + mem::size_of::(); let bytes = &frame[skip..]; + if bytes.is_empty() { + return None; + } bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); vec.push(f); }); - &vec[..] + Some(&vec[..]) } } @@ -259,9 +260,8 @@ impl ArroySetVector { /// non-aligned [f32] each with dimensions f32s. pub struct ArroySetVectors { pub docid: DocumentId, - pub dimensions: u16, pub embedder_id: u8, - _padding: u8, + _padding: [u8; 3], } impl ArroySetVectors { @@ -270,30 +270,6 @@ impl ArroySetVectors { &frame[skip..] } - // /// The number of embeddings in this payload. - // pub fn embedding_count(&self, frame: &FrameGrantR<'_>) -> usize { - // let bytes = Self::remaining_bytes(frame); - // bytes.len().checked_div(self.dimensions as usize).unwrap() - // } - - /// Read the embedding at `index` or `None` if out of bounds. 
- pub fn read_embedding_into_vec<'v>( - &self, - frame: &FrameGrantR<'_>, - index: usize, - vec: &'v mut Vec, - ) -> Option<&'v [f32]> { - vec.clear(); - let bytes = Self::remaining_bytes(frame); - let embedding_size = self.dimensions as usize * mem::size_of::(); - let embedding_bytes = bytes.chunks_exact(embedding_size).nth(index)?; - embedding_bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { - let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); - vec.push(f); - }); - Some(&vec[..]) - } - /// Read all the embeddings and write them into an aligned `f32` Vec. pub fn read_all_embeddings_into_vec<'v>( &self, @@ -607,18 +583,14 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); + // If there are no vector we specify the dimensions + // to zero to allocate no extra space at all let dimensions = match embeddings.first() { Some(embedding) => embedding.len(), - None => return Ok(()), - }; - - let arroy_set_vector = ArroySetVectors { - docid, - dimensions: dimensions.try_into().unwrap(), - embedder_id, - _padding: 0, + None => 0, }; + let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); if total_length > capacity { @@ -632,13 +604,7 @@ impl<'b> ExtractorBbqueueSender<'b> { value_file.sync_all()?; let embeddings = unsafe { Mmap::map(&value_file)? }; - let large_vectors = LargeVectors { - docid, - embedder_id, - dimensions: dimensions.try_into().unwrap(), - embeddings, - }; - + let large_vectors = LargeVectors { docid, embedder_id, embeddings }; self.sender.send(ReceiverAction::LargeVectors(large_vectors)).unwrap(); return Ok(()); @@ -657,9 +623,11 @@ impl<'b> ExtractorBbqueueSender<'b> { let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); - let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); - for (embedding, output) in embeddings.iter().zip(output_iter) { - output.copy_from_slice(bytemuck::cast_slice(embedding)); + if dimensions != 0 { + let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } } // We could commit only the used memory. diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index a8a94cb7c..07cb9d69e 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -443,7 +443,7 @@ where let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); - for embedding in large_vectors.read_embeddings() { + for embedding in large_vectors.read_embeddings(*dimensions) { embeddings.push(embedding.to_vec()).unwrap(); } writer.del_items(wtxn, *dimensions, docid)?; @@ -597,11 +597,12 @@ fn write_from_bbqueue( EntryHeader::ArroySetVector(asv) => { let ArroySetVector { docid, embedder_id, .. 
} = asv; let frame = frame_with_header.frame(); - let embedding = asv.read_embedding_into_vec(frame, aligned_embedding); let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, embedding)?; + if let Some(embedding) = asv.read_embedding_into_vec(frame, aligned_embedding) { + writer.add_item(wtxn, docid, embedding)?; + } } EntryHeader::ArroySetVectors(asvs) => { let ArroySetVectors { docid, embedder_id, .. } = asvs; diff --git a/crates/milli/src/update/new/ref_cell_ext.rs b/crates/milli/src/update/new/ref_cell_ext.rs index c66f4af0a..77f5fa800 100644 --- a/crates/milli/src/update/new/ref_cell_ext.rs +++ b/crates/milli/src/update/new/ref_cell_ext.rs @@ -5,6 +5,7 @@ pub trait RefCellExt { &self, ) -> std::result::Result, std::cell::BorrowMutError>; + #[track_caller] fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { self.try_borrow_mut_or_yield().unwrap() } From 90b428a8c3d5930133870cb14d5e950baed1a1ad Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 28 Nov 2024 15:16:13 +0100 Subject: [PATCH 042/158] Apply change requests --- .../src/routes/indexes/settings.rs | 6 + .../tests/settings/get_settings.rs | 360 +++++++++--------- 2 files changed, 186 insertions(+), 180 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index bb24fc880..b2922e5ff 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -17,6 +17,12 @@ use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; +/// This macro generates the routes for the settings. +/// +/// It takes a list of settings and generates a module for each setting. +/// Each module contains the `get`, `update` and `delete` routes for the setting. +/// +/// It also generates a `configure` function that configures the routes for the settings. macro_rules! make_setting_routes { ($({route: $route:literal, update_verb: $update_verb:ident, value_type: $type:ty, err_type: $err_ty:ty, attr: $attr:ident, camelcase_attr: $camelcase_attr:literal, analytics: $analytics:ident},)*) => { $( diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index b9e10033a..55d9441ee 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -1,6 +1,186 @@ use crate::common::Server; use crate::json; +macro_rules! 
test_setting_routes { + ($({setting: $setting:ident, update_verb: $update_verb:ident, default_value: $default_value:tt},) *) => { + $( + mod $setting { + use crate::common::Server; + + #[actix_rt::test] + async fn get_unexisting_index() { + let server = Server::new().await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (_response, code) = server.service.get(url).await; + assert_eq!(code, 404); + } + + #[actix_rt::test] + async fn update_unexisting_index() { + let server = Server::new().await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (response, code) = server.service.$update_verb(url, serde_json::Value::Null.into()).await; + assert_eq!(code, 202, "{}", response); + server.index("").wait_task(0).await; + let (response, code) = server.index("test").get().await; + assert_eq!(code, 200, "{}", response); + } + + #[actix_rt::test] + async fn delete_unexisting_index() { + let server = Server::new().await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (_, code) = server.service.delete(url).await; + assert_eq!(code, 202); + let response = server.index("").wait_task(0).await; + assert_eq!(response["status"], "failed"); + } + + #[actix_rt::test] + async fn get_default() { + let server = Server::new().await; + let index = server.index("test"); + let (response, code) = index.create(None).await; + assert_eq!(code, 202, "{}", response); + index.wait_task(0).await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (response, code) = server.service.get(url).await; + assert_eq!(code, 200, "{}", response); + let expected = crate::json!($default_value); + assert_eq!(expected, response); + } + } + )* + + #[actix_rt::test] + async fn all_setting_tested() { + let expected = std::collections::BTreeSet::from_iter(meilisearch::routes::indexes::settings::ALL_SETTINGS_NAMES.iter()); + let tested = std::collections::BTreeSet::from_iter([$(stringify!($setting)),*].iter()); + let diff: Vec<_> = expected.difference(&tested).collect(); + assert!(diff.is_empty(), "Not all settings were tested, please add the following settings to the `test_setting_routes!` macro: {:?}", diff); + } + }; +} + +test_setting_routes!( + { + setting: filterable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: displayed_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: localized_attributes, + update_verb: put, + default_value: null + }, + { + setting: searchable_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: distinct_attribute, + update_verb: put, + default_value: null + }, + { + setting: stop_words, + update_verb: put, + default_value: [] + }, + { + setting: separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: non_separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: dictionary, + update_verb: put, + default_value: [] + }, + { + setting: ranking_rules, + update_verb: put, + default_value: ["words", "typo", "proximity", "attribute", "sort", "exactness"] + }, + { + setting: synonyms, + update_verb: put, + default_value: {} + }, + { + setting: pagination, + update_verb: patch, + default_value: {"maxTotalHits": 1000} + }, 
+ { + setting: faceting, + update_verb: patch, + default_value: {"maxValuesPerFacet": 100, "sortFacetValuesBy": {"*": "alpha"}} + }, + { + setting: search_cutoff_ms, + update_verb: put, + default_value: null + }, + { + setting: embedders, + update_verb: patch, + default_value: null + }, + { + setting: facet_search, + update_verb: put, + default_value: true + }, + { + setting: prefix_search, + update_verb: put, + default_value: "indexingTime" + }, + { + setting: proximity_precision, + update_verb: put, + default_value: "byWord" + }, + { + setting: sortable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: typo_tolerance, + update_verb: patch, + default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": []} + }, +); + #[actix_rt::test] async fn get_settings_unexisting_index() { let server = Server::new().await; @@ -303,186 +483,6 @@ async fn error_update_setting_unexisting_index_invalid_uid() { "###); } -macro_rules! test_setting_routes { - ($({setting: $setting:ident, update_verb: $update_verb:ident, default_value: $default_value:tt},) *) => { - $( - mod $setting { - use crate::common::Server; - - #[actix_rt::test] - async fn get_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (_response, code) = server.service.get(url).await; - assert_eq!(code, 404); - } - - #[actix_rt::test] - async fn update_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (response, code) = server.service.$update_verb(url, serde_json::Value::Null.into()).await; - assert_eq!(code, 202, "{}", response); - server.index("").wait_task(0).await; - let (response, code) = server.index("test").get().await; - assert_eq!(code, 200, "{}", response); - } - - #[actix_rt::test] - async fn delete_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (_, code) = server.service.delete(url).await; - assert_eq!(code, 202); - let response = server.index("").wait_task(0).await; - assert_eq!(response["status"], "failed"); - } - - #[actix_rt::test] - async fn get_default() { - let server = Server::new().await; - let index = server.index("test"); - let (response, code) = index.create(None).await; - assert_eq!(code, 202, "{}", response); - index.wait_task(0).await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (response, code) = server.service.get(url).await; - assert_eq!(code, 200, "{}", response); - let expected = crate::json!($default_value); - assert_eq!(expected, response); - } - } - )* - - #[actix_rt::test] - async fn all_setting_tested() { - let expected = std::collections::BTreeSet::from_iter(meilisearch::routes::indexes::settings::ALL_SETTINGS_NAMES.iter()); - let tested = std::collections::BTreeSet::from_iter([$(stringify!($setting)),*].iter()); - let diff: Vec<_> = expected.difference(&tested).collect(); - assert!(diff.is_empty(), "Not all settings were tested, please add the following settings to the `test_setting_routes!` macro: {:?}", diff); - } - }; -} - -test_setting_routes!( - 
{ - setting: filterable_attributes, - update_verb: put, - default_value: [] - }, - { - setting: displayed_attributes, - update_verb: put, - default_value: ["*"] - }, - { - setting: localized_attributes, - update_verb: put, - default_value: null - }, - { - setting: searchable_attributes, - update_verb: put, - default_value: ["*"] - }, - { - setting: distinct_attribute, - update_verb: put, - default_value: null - }, - { - setting: stop_words, - update_verb: put, - default_value: [] - }, - { - setting: separator_tokens, - update_verb: put, - default_value: [] - }, - { - setting: non_separator_tokens, - update_verb: put, - default_value: [] - }, - { - setting: dictionary, - update_verb: put, - default_value: [] - }, - { - setting: ranking_rules, - update_verb: put, - default_value: ["words", "typo", "proximity", "attribute", "sort", "exactness"] - }, - { - setting: synonyms, - update_verb: put, - default_value: {} - }, - { - setting: pagination, - update_verb: patch, - default_value: {"maxTotalHits": 1000} - }, - { - setting: faceting, - update_verb: patch, - default_value: {"maxValuesPerFacet": 100, "sortFacetValuesBy": {"*": "alpha"}} - }, - { - setting: search_cutoff_ms, - update_verb: put, - default_value: null - }, - { - setting: embedders, - update_verb: patch, - default_value: null - }, - { - setting: facet_search, - update_verb: put, - default_value: true - }, - { - setting: prefix_search, - update_verb: put, - default_value: "indexingTime" - }, - { - setting: proximity_precision, - update_verb: put, - default_value: "byWord" - }, - { - setting: sortable_attributes, - update_verb: put, - default_value: [] - }, - { - setting: typo_tolerance, - update_verb: patch, - default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": []} - }, -); - #[actix_rt::test] async fn error_set_invalid_ranking_rules() { let server = Server::new().await; From b57dd5c58e2944bb607681a4adfcf0b05dd25b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 15:19:57 +0100 Subject: [PATCH 043/158] Remove the Vector variant and use the Vectors --- crates/milli/src/update/new/channel.rs | 126 +-------------------- crates/milli/src/update/new/indexer/mod.rs | 19 ---- 2 files changed, 4 insertions(+), 141 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 38f436837..102a27336 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -100,7 +100,6 @@ pub enum ReceiverAction { /// Wake up, you have frames to read for the BBQueue buffers. WakeUp, LargeEntry(LargeEntry), - LargeVector(LargeVector), LargeVectors(LargeVectors), } @@ -120,24 +119,6 @@ pub struct LargeEntry { pub value: Mmap, } -/// When an embedding is larger than the available -/// BBQueue space it arrives here. -#[derive(Debug)] -pub struct LargeVector { - /// The document id associated to the large embedding. - pub docid: DocumentId, - /// The embedder id in which to insert the large embedding. - pub embedder_id: u8, - /// The large embedding that must be written. - pub embedding: Mmap, -} - -impl LargeVector { - pub fn read_embedding(&self) -> &[f32] { - bytemuck::cast_slice(&self.embedding) - } -} - /// When embeddings are larger than the available /// BBQueue space it arrives here. 
#[derive(Debug)] @@ -225,35 +206,6 @@ pub struct ArroyDeleteVector { pub docid: DocumentId, } -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(C)] -/// The embedding is the remaining space and represents a non-aligned [f32]. -pub struct ArroySetVector { - pub docid: DocumentId, - pub embedder_id: u8, - _padding: [u8; 3], -} - -impl ArroySetVector { - pub fn read_embedding_into_vec<'v>( - &self, - frame: &FrameGrantR<'_>, - vec: &'v mut Vec, - ) -> Option<&'v [f32]> { - vec.clear(); - let skip = EntryHeader::variant_size() + mem::size_of::(); - let bytes = &frame[skip..]; - if bytes.is_empty() { - return None; - } - bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { - let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); - vec.push(f); - }); - Some(&vec[..]) - } -} - #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(C)] /// The embeddings are in the remaining space and represents @@ -290,7 +242,6 @@ impl ArroySetVectors { pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), - ArroySetVector(ArroySetVector), ArroySetVectors(ArroySetVectors), } @@ -303,8 +254,7 @@ impl EntryHeader { match self { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, - EntryHeader::ArroySetVector(_) => 2, - EntryHeader::ArroySetVectors(_) => 3, + EntryHeader::ArroySetVectors(_) => 2, } } @@ -323,11 +273,6 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() } - /// The `dimensions` corresponds to the number of `f32` in the embedding. - fn total_set_vector_size(dimensions: usize) -> usize { - Self::variant_size() + mem::size_of::() + dimensions * mem::size_of::() - } - /// The `dimensions` corresponds to the number of `f32` in the embedding. fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { let embedding_size = dimensions * mem::size_of::(); @@ -338,7 +283,6 @@ impl EntryHeader { let payload_size = match self { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), - EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), }; Self::variant_size() + payload_size @@ -358,11 +302,6 @@ impl EntryHeader { EntryHeader::ArroyDeleteVector(header) } 2 => { - let header_bytes = &remaining[..mem::size_of::()]; - let header = checked::pod_read_unaligned(header_bytes); - EntryHeader::ArroySetVector(header) - } - 3 => { let header_bytes = &remaining[..mem::size_of::()]; let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVectors(header) @@ -376,7 +315,6 @@ impl EntryHeader { let payload_bytes = match self { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), - EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), }; *first = self.variant_id(); @@ -520,59 +458,6 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } - fn set_vector( - &self, - docid: DocumentId, - embedder_id: u8, - embedding: &[f32], - ) -> crate::Result<()> { - let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - - let arroy_set_vector = ArroySetVector { docid, embedder_id, _padding: [0; 3] }; - let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); - let total_length = EntryHeader::total_set_vector_size(embedding.len()); - if total_length > capacity { - let mut 
embedding_bytes = bytemuck::cast_slice(embedding); - let mut value_file = tempfile::tempfile().map(BufWriter::new)?; - io::copy(&mut embedding_bytes, &mut value_file)?; - let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; - value_file.sync_all()?; - let embedding = unsafe { Mmap::map(&value_file)? }; - - let large_vector = LargeVector { docid, embedder_id, embedding }; - self.sender.send(ReceiverAction::LargeVector(large_vector)).unwrap(); - - return Ok(()); - } - - // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; - - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - remaining.copy_from_slice(bytemuck::cast_slice(embedding)); - - // We could commit only the used memory. - grant.commit(total_length); - - // We only send a wake up message when the channel is empty - // so that we don't fill the channel with too many WakeUps. - if self.sender.is_empty() { - self.sender.send(ReceiverAction::WakeUp).unwrap(); - } - - Ok(()) - } - fn set_vectors( &self, docid: u32, @@ -583,12 +468,9 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - // If there are no vector we specify the dimensions + // If there are no vectors we specify the dimensions // to zero to allocate no extra space at all - let dimensions = match embeddings.first() { - Some(embedding) => embedding.len(), - None => 0, - }; + let dimensions = embeddings.first().map_or(0, |emb| emb.len()); let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); @@ -954,7 +836,7 @@ impl EmbeddingSender<'_, '_> { embedder_id: u8, embedding: Embedding, ) -> crate::Result<()> { - self.0.set_vector(docid, embedder_id, &embedding[..]) + self.0.set_vectors(docid, embedder_id, &[embedding]) } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 07cb9d69e..9a6b40efb 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -16,7 +16,6 @@ use rand::SeedableRng as _; use raw_collections::RawMap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; -use {LargeEntry, LargeVector}; use super::channel::*; use super::extract::*; @@ -430,14 +429,6 @@ where })); } } - ReceiverAction::LargeVector(large_vector) => { - let embedding = large_vector.read_embedding(); - let LargeVector { docid, embedder_id, .. } = large_vector; - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, embedding)?; - } ReceiverAction::LargeVectors(large_vectors) => { let LargeVectors { docid, embedder_id, .. } = large_vectors; let (_, _, writer, dimensions) = @@ -594,16 +585,6 @@ fn write_from_bbqueue( writer.del_items(wtxn, dimensions, docid)?; } } - EntryHeader::ArroySetVector(asv) => { - let ArroySetVector { docid, embedder_id, .. 
} = asv; - let frame = frame_with_header.frame(); - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - if let Some(embedding) = asv.read_embedding_into_vec(frame, aligned_embedding) { - writer.add_item(wtxn, docid, embedding)?; - } - } EntryHeader::ArroySetVectors(asvs) => { let ArroySetVectors { docid, embedder_id, .. } = asvs; let frame = frame_with_header.frame(); From 3c7ac093d39a6fa08eaf5e34814ba967037e80ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 15:43:14 +0100 Subject: [PATCH 044/158] Take the BBQueue capacity into account in the max memory --- crates/milli/src/update/new/channel.rs | 11 +++++++---- crates/milli/src/update/new/indexer/mod.rs | 23 ++++++++++++++-------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 102a27336..1a463be1e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -27,8 +27,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// -/// The `bbqueue_capacity` represent the number of bytes allocated -/// to each BBQueue buffer and is not the sum of all of them. +/// The `total_bbbuffer_capacity` represent the number of bytes +/// allocated to all BBQueue buffer. It will be split by the +/// number of thread. /// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through @@ -46,10 +47,12 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( bbbuffers: &mut Vec, - bbbuffer_capacity: usize, + total_bbbuffer_capacity: usize, channel_capacity: usize, ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { - bbbuffers.resize_with(rayon::current_num_threads(), || BBBuffer::new(bbbuffer_capacity)); + let current_num_threads = rayon::current_num_threads(); + let bbbuffer_capacity = total_bbbuffer_capacity.checked_div(current_num_threads).unwrap(); + bbbuffers.resize_with(current_num_threads, || BBBuffer::new(bbbuffer_capacity)); let capacity = bbbuffers.first().unwrap().capacity(); // Read the field description to understand this diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 9a6b40efb..99ee89701 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -79,15 +79,22 @@ where { let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); + + // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. 
+ let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( + (grenad_parameters, 100 * 1024 * 1024 * pool.current_num_threads()), // 100 MiB by thread by default + |max_memory| { + let total_bbbuffer_capacity = max_memory / 10; // 10% of the indexing memory + let new_grenad_parameters = GrenadParameters { + max_memory: Some(max_memory - total_bbbuffer_capacity), + ..grenad_parameters + }; + (new_grenad_parameters, total_bbbuffer_capacity) + }, + ); + let (extractor_sender, mut writer_receiver) = pool - .install(|| { - /// TODO restrict memory and remove this memory from the extractors bump allocators - extractor_writer_bbqueue( - &mut bbbuffers, - 100 * 1024 * 1024, // 100 MiB - 1000, - ) - }) + .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) .unwrap(); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; From 8a35cd1743ec4ce9e8b872bbd9bb0ede4aaad35d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 16:00:15 +0100 Subject: [PATCH 045/158] Adjust the BBQueue buffers to use 2% instead of 10% --- crates/milli/src/update/new/indexer/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 99ee89701..19f1bca3e 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -84,7 +84,7 @@ where let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( (grenad_parameters, 100 * 1024 * 1024 * pool.current_num_threads()), // 100 MiB by thread by default |max_memory| { - let total_bbbuffer_capacity = max_memory / 10; // 10% of the indexing memory + let total_bbbuffer_capacity = max_memory / (100 / 2); // 2% of the indexing memory let new_grenad_parameters = GrenadParameters { max_memory: Some(max_memory - total_bbbuffer_capacity), ..grenad_parameters From 14ee7aa84c7fc82e6475f551b1fc9d2b4f8aaff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 18:02:48 +0100 Subject: [PATCH 046/158] Make sure the BBQueue is at least 50 MiB --- crates/milli/src/update/new/indexer/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 19f1bca3e..e0450ff7d 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -81,10 +81,12 @@ where let finished_extraction = AtomicBool::new(false); // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. 
+ let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( - (grenad_parameters, 100 * 1024 * 1024 * pool.current_num_threads()), // 100 MiB by thread by default + (grenad_parameters, 2 * minimum_capacity), // 100 MiB by thread by default |max_memory| { - let total_bbbuffer_capacity = max_memory / (100 / 2); // 2% of the indexing memory + // 2% of the indexing memory + let total_bbbuffer_capacity = (max_memory / 100 / 2).min(minimum_capacity); let new_grenad_parameters = GrenadParameters { max_memory: Some(max_memory - total_bbbuffer_capacity), ..grenad_parameters From 13f21206a64de13202cec3c2841a8c3654b6899a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:03:01 +0100 Subject: [PATCH 047/158] Call the serialize_into_writer method from the serialize_into one --- .../roaring_bitmap/cbo_roaring_bitmap_codec.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index cae1874dd..20a246dcd 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -27,18 +27,8 @@ impl CboRoaringBitmapCodec { } } - pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) { - if roaring.len() <= THRESHOLD as u64 { - // If the number of items (u32s) to encode is less than or equal to the threshold - // it means that it would weigh the same or less than the RoaringBitmap - // header, so we directly encode them using ByteOrder instead. - for integer in roaring { - vec.write_u32::(integer).unwrap(); - } - } else { - // Otherwise, we use the classic RoaringBitmapCodec that writes a header. 
- roaring.serialize_into(vec).unwrap(); - } + pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec) { + Self::serialize_into_writer(roaring, vec).unwrap() } pub fn serialize_into_writer( From db4eaf4d2de4140fde57ddfd71af80f8a4ed4826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:03:27 +0100 Subject: [PATCH 048/158] Rename serialize_into into serialize_into_writer --- crates/milli/src/heed_codec/facet/mod.rs | 2 +- .../heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs | 4 ++-- crates/milli/src/update/new/extract/cache.rs | 8 ++++---- crates/milli/src/update/new/words_prefix_docids.rs | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/heed_codec/facet/mod.rs b/crates/milli/src/heed_codec/facet/mod.rs index a8bb5055e..c0870c9fd 100644 --- a/crates/milli/src/heed_codec/facet/mod.rs +++ b/crates/milli/src/heed_codec/facet/mod.rs @@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { fn bytes_encode(value: &'a Self::EItem) -> Result, BoxedError> { let mut v = vec![value.size]; - CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v); Ok(Cow::Owned(v)) } } diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 20a246dcd..0ab162880 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -152,7 +152,7 @@ impl CboRoaringBitmapCodec { return Ok(None); } - Self::serialize_into(&previous, buffer); + Self::serialize_into_vec(&previous, buffer); Ok(Some(&buffer[..])) } } @@ -178,7 +178,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { let mut vec = Vec::with_capacity(Self::serialized_size(item)); - Self::serialize_into(item, &mut vec); + Self::serialize_into_vec(item, &mut vec); Ok(Cow::Owned(vec)) } } diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 26ed0eb44..be077d142 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -415,21 +415,21 @@ fn spill_entry_to_sorter( match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index 
338d22505..7e56beeae 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -76,7 +76,7 @@ impl WordPrefixDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixEntry { prefix, serialized_length: buffer.len() }); file.write_all(buffer) })?; @@ -211,7 +211,7 @@ impl WordPrefixIntegerDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() }); file.write_all(buffer)?; } From 76d0623b11b88c169843bbc61c1b8bff132e9d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:05:06 +0100 Subject: [PATCH 049/158] Reduce the number of unwraps --- crates/milli/src/update/new/merger.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index f8af84177..b650b6b53 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -56,7 +56,7 @@ where let rtree_mmap = unsafe { Mmap::map(&file)? }; geo_sender.set_rtree(rtree_mmap).unwrap(); - geo_sender.set_geo_faceted(&faceted).unwrap(); + geo_sender.set_geo_faceted(&faceted)?; Ok(()) } @@ -82,11 +82,11 @@ where let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - docids_sender.write(key, &bitmap).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), @@ -112,12 +112,12 @@ pub fn merge_and_send_facet_docids<'extractor>( match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - docids_sender.write(key, &bitmap).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { facet_field_ids_delta.register_from_key(key); - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), From 5b860cb9893ded811150f9ae0332dc89f166ea6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:06:35 +0100 Subject: [PATCH 050/158] Fix english in the doc --- crates/milli/src/update/new/channel.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 1a463be1e..7375354aa 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -27,9 +27,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// -/// The `total_bbbuffer_capacity` represent the number of bytes -/// allocated to all BBQueue buffer. It will be split by the -/// number of thread. +/// The `total_bbbuffer_capacity` represents the number of bytes +/// allocated to all BBQueue buffers. It will be split by the +/// number of threads. /// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through @@ -37,14 +37,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// sure we do not use too much memory. 
/// /// Note that the channel is also used to wake-up the receiver -/// wehn new stuff is available in any BBQueue buffer but we send +/// when new stuff is available in any BBQueue buffer but we send /// a message in this queue only if it is empty to avoid filling /// the channel *and* the BBQueue. -/// -/// # Safety -/// -/// Panics if the number of provided BBQueues is not exactly equal -/// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( bbbuffers: &mut Vec, total_bbbuffer_capacity: usize, @@ -82,7 +77,7 @@ pub struct ExtractorBbqueueSender<'a> { /// The capacity of this frame producer, will never be able to store more than that. /// /// Note that the FrameProducer requires up to 9 bytes to encode the length, - /// the capacity has been shrinked accordingly. + /// the capacity has been shrunk accordingly. /// /// capacity: usize, From 30eb0e5b5baad02475a73c5ae16f3a1713bd21a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:08:01 +0100 Subject: [PATCH 051/158] Rename recv and read methods to recv_action and recv_frame --- crates/milli/src/update/new/channel.rs | 4 ++-- crates/milli/src/update/new/indexer/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7375354aa..82e483d18 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -136,11 +136,11 @@ impl LargeVectors { } impl<'a> WriterBbqueueReceiver<'a> { - pub fn recv(&mut self) -> Option { + pub fn recv_action(&mut self) -> Option { self.receiver.recv().ok() } - pub fn read(&mut self) -> Option> { + pub fn recv_frame(&mut self) -> Option> { for consumer in &mut self.consumers { if let Some(frame) = consumer.read() { return Some(FrameWithHeader::from(frame)); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e0450ff7d..bd3fedae2 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -417,7 +417,7 @@ where let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); let mut _entered_post_merge = None; - while let Some(action) = writer_receiver.recv() { + while let Some(action) = writer_receiver.recv_action() { if _entered_post_merge.is_none() && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) { @@ -556,7 +556,7 @@ fn write_from_bbqueue( arroy_writers: &HashMap, aligned_embedding: &mut Vec, ) -> crate::Result<()> { - while let Some(frame_with_header) = writer_receiver.read() { + while let Some(frame_with_header) = writer_receiver.recv_frame() { match frame_with_header.header() { EntryHeader::DbOperation(operation) => { let database_name = operation.database.database_name(); From 5df5eb2db26159f79a0cedaea575bb9a79e098c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:10:48 +0100 Subject: [PATCH 052/158] Clarify a method name --- crates/milli/src/update/new/channel.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 82e483d18..7b083341b 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -215,7 +215,7 @@ pub struct ArroySetVectors { } impl ArroySetVectors { - fn remaining_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + fn embeddings_bytes<'a>(frame: &'a 
FrameGrantR<'_>) -> &'a [u8] { let skip = EntryHeader::variant_size() + mem::size_of::(); &frame[skip..] } @@ -227,7 +227,7 @@ impl ArroySetVectors { vec: &'v mut Vec, ) -> &'v [f32] { vec.clear(); - Self::remaining_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { + Self::embeddings_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); vec.push(f); }); From f7f9a131e400bc995d7ef152e559b1e70ecd85e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:15:58 +0100 Subject: [PATCH 053/158] Improve copying bytes into aligned memory area --- crates/milli/src/update/new/channel.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7b083341b..7a997c3af 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -226,11 +226,10 @@ impl ArroySetVectors { frame: &FrameGrantR<'_>, vec: &'v mut Vec, ) -> &'v [f32] { - vec.clear(); - Self::embeddings_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { - let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); - vec.push(f); - }); + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); &vec[..] } } From be7d2fbe63070066538a6450d5e46803990169b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:19:11 +0100 Subject: [PATCH 054/158] Move the EntryHeader up in the file and document the safety related to the size --- crates/milli/src/update/new/channel.rs | 128 +++++++++++++------------ 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7a997c3af..bebaad686 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -172,68 +172,10 @@ impl<'a> From> for FrameWithHeader<'a> { } } -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(C)] -/// Wether a put of the key/value pair or a delete of the given key. -pub struct DbOperation { - /// The database on which to perform the operation. - pub database: Database, - /// The key length in the buffer. - /// - /// If None it means that the buffer is dedicated - /// to the key and it is therefore a deletion operation. - pub key_length: Option, -} - -impl DbOperation { - pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { - let skip = EntryHeader::variant_size() + mem::size_of::(); - match self.key_length { - Some(key_length) => { - let (key, value) = frame[skip..].split_at(key_length.get() as usize); - (key, Some(value)) - } - None => (&frame[skip..], None), - } - } -} - -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(transparent)] -pub struct ArroyDeleteVector { - pub docid: DocumentId, -} - -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(C)] -/// The embeddings are in the remaining space and represents -/// non-aligned [f32] each with dimensions f32s. 
-pub struct ArroySetVectors { - pub docid: DocumentId, - pub embedder_id: u8, - _padding: [u8; 3], -} - -impl ArroySetVectors { - fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { - let skip = EntryHeader::variant_size() + mem::size_of::(); - &frame[skip..] - } - - /// Read all the embeddings and write them into an aligned `f32` Vec. - pub fn read_all_embeddings_into_vec<'v>( - &self, - frame: &FrameGrantR<'_>, - vec: &'v mut Vec, - ) -> &'v [f32] { - let embeddings_bytes = Self::embeddings_bytes(frame); - let embeddings_count = embeddings_bytes.len() / mem::size_of::(); - vec.resize(embeddings_count, 0.0); - bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); - &vec[..] - } -} - +/// A header that is written at the beginning of a bbqueue frame. +/// +/// Note that the different variants cannot be changed without taking +/// care of their size in the implementation, like, everywhere. #[derive(Debug, Clone, Copy)] #[repr(u8)] pub enum EntryHeader { @@ -319,6 +261,68 @@ impl EntryHeader { } } +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// Wether a put of the key/value pair or a delete of the given key. +pub struct DbOperation { + /// The database on which to perform the operation. + pub database: Database, + /// The key length in the buffer. + /// + /// If None it means that the buffer is dedicated + /// to the key and it is therefore a deletion operation. + pub key_length: Option, +} + +impl DbOperation { + pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { + let skip = EntryHeader::variant_size() + mem::size_of::(); + match self.key_length { + Some(key_length) => { + let (key, value) = frame[skip..].split_at(key_length.get() as usize); + (key, Some(value)) + } + None => (&frame[skip..], None), + } + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(transparent)] +pub struct ArroyDeleteVector { + pub docid: DocumentId, +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVectors { + pub docid: DocumentId, + pub embedder_id: u8, + _padding: [u8; 3], +} + +impl ArroySetVectors { + fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + /// Read all the embeddings and write them into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); + &vec[..] 
+ } +} + #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(u16)] pub enum Database { From 263c5a348ee321559b8b98789d70be9950d6ec83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:33:49 +0100 Subject: [PATCH 055/158] Move the spin looping for BBQueue frames into a dedicated function --- Cargo.lock | 13 +++++ crates/milli/Cargo.toml | 1 + crates/milli/src/update/new/channel.rs | 79 ++++++++++++-------------- 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a0a6b3d0..038b269ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1910,6 +1910,15 @@ dependencies = [ "serde_json", ] +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "spin", +] + [[package]] name = "fnv" version = "1.0.7" @@ -3623,6 +3632,7 @@ dependencies = [ "enum-iterator", "filter-parser", "flatten-serde-json", + "flume", "fst", "fxhash", "geoutils", @@ -5180,6 +5190,9 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] [[package]] name = "spm_precompiled" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index b66dec9a4..a88401470 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -99,6 +99,7 @@ rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" bbqueue = { git = "https://github.com/kerollmops/bbqueue" } +flume = { version = "0.11.1", default-features = false } [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index bebaad686..e8bb6930c 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -4,10 +4,10 @@ use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; -use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; -use crossbeam_channel::SendError; +use flume::SendError; use heed::types::Bytes; use heed::BytesDecode; use memmap2::{Mmap, MmapMut}; @@ -33,7 +33,7 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through -/// a crossbeam channel. This parameter must stay low to make +/// a flume channel. This parameter must stay low to make /// sure we do not use too much memory. /// /// Note that the channel is also used to wake-up the receiver @@ -61,7 +61,7 @@ pub fn extractor_writer_bbqueue( consumer }); - let (sender, receiver) = crossbeam_channel::bounded(channel_capacity); + let (sender, receiver) = flume::bounded(channel_capacity); let sender = ExtractorBbqueueSender { sender, producers, capacity }; let receiver = WriterBbqueueReceiver { receiver, consumers }; (sender, receiver) @@ -70,7 +70,7 @@ pub fn extractor_writer_bbqueue( pub struct ExtractorBbqueueSender<'a> { /// This channel is used to wake-up the receiver and /// send large entries that cannot fit in the BBQueue. 
- sender: crossbeam_channel::Sender, + sender: flume::Sender, /// A memory buffer, one by thread, is used to serialize /// the entries directly in this shared, lock-free space. producers: ThreadLocal>>>, @@ -87,7 +87,7 @@ pub struct WriterBbqueueReceiver<'a> { /// Used to wake up when new entries are available either in /// any BBQueue buffer or directly sent throught this channel /// (still written to disk). - receiver: crossbeam_channel::Receiver, + receiver: flume::Receiver, /// The BBQueue frames to read when waking-up. consumers: Vec>, } @@ -437,19 +437,9 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; - + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); payload_header.serialize_into(&mut grant); - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. if self.sender.is_empty() { @@ -494,13 +484,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -571,13 +555,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -585,9 +563,6 @@ impl<'b> ExtractorBbqueueSender<'b> { let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); key_value_writer(key_buffer, value_buffer)?; - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. if self.sender.is_empty() { @@ -628,22 +603,13 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); key_writer(remaining)?; - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. 
if self.sender.is_empty() { @@ -654,6 +620,31 @@ impl<'b> ExtractorBbqueueSender<'b> { } } +/// Try to reserve a frame grant of `total_length` by spin looping +/// on the BBQueue buffer and panics if the receiver has been disconnected. +fn reserve_grant<'b>( + producer: &mut FrameProducer<'b>, + total_length: usize, + sender: &flume::Sender, +) -> FrameGrantW<'b> { + loop { + for _ in 0..10_000 { + match producer.grant(total_length) { + Ok(mut grant) => { + // We could commit only the used memory. + grant.to_commit(total_length); + return grant; + } + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + } + if sender.is_disconnected() { + panic!("channel is disconnected"); + } + } +} + pub enum ExactWordDocids {} pub enum FidWordCountDocids {} pub enum WordDocids {} From bcab61ab1d83738710a69766bf4c3723b1596906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:42:47 +0100 Subject: [PATCH 056/158] Do spurious wake ups on the receiver side --- crates/milli/src/update/new/channel.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index e8bb6930c..631fcf74e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -3,11 +3,12 @@ use std::io::{self, BufWriter}; use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; +use std::time::Duration; use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; -use flume::SendError; +use flume::{RecvTimeoutError, SendError}; use heed::types::Bytes; use heed::BytesDecode; use memmap2::{Mmap, MmapMut}; @@ -136,10 +137,24 @@ impl LargeVectors { } impl<'a> WriterBbqueueReceiver<'a> { + /// Tries to receive an action to do until the timeout occurs + /// and if it does, consider it as a spurious wake up. pub fn recv_action(&mut self) -> Option { - self.receiver.recv().ok() + match self.receiver.recv_timeout(Duration::from_millis(100)) { + Ok(action) => Some(action), + Err(RecvTimeoutError::Timeout) => Some(ReceiverAction::WakeUp), + Err(RecvTimeoutError::Disconnected) => None, + } } + /// Reads all the BBQueue buffers and selects the first available frame. + /// + /// Note: Selecting the first available frame gives preference to + /// frames that will be cleaned up first. It may result in the + /// last frames being more likely to fill up. One potential optimization + /// could involve keeping track of the last processed BBQueue index + /// to cycle through the frames instead of always starting from the + /// beginning. 
pub fn recv_frame(&mut self) -> Option> { for consumer in &mut self.consumers { if let Some(frame) = consumer.read() { From 5e218f3f4daf1594580eb377183770fd4a206a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:44:42 +0100 Subject: [PATCH 057/158] Remove a sync_all (mark my words) --- crates/milli/src/update/new/channel.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 631fcf74e..219f20854 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -489,7 +489,6 @@ impl<'b> ExtractorBbqueueSender<'b> { } let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; - value_file.sync_all()?; let embeddings = unsafe { Mmap::map(&value_file)? }; let large_vectors = LargeVectors { docid, embedder_id, embeddings }; From d5c07ef7b310f8af30a6d5ac0ea2b0da93241709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 11:02:49 +0100 Subject: [PATCH 058/158] Manage key length conversion error correctly --- crates/milli/src/error.rs | 10 ++-- crates/milli/src/update/new/channel.rs | 53 ++++++++++++++++++---- crates/milli/src/update/new/indexer/mod.rs | 2 +- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 800dfa375..a6774a7bd 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -3,6 +3,7 @@ use std::convert::Infallible; use std::fmt::Write; use std::{io, str}; +use bstr::BString; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use rhai::EvalAltResult; @@ -62,14 +63,9 @@ pub enum InternalError { #[error(transparent)] Store(#[from] MdbError), #[error("Cannot delete {key:?} from database {database_name}: {error}")] - StoreDeletion { database_name: &'static str, key: Box<[u8]>, error: heed::Error }, + StoreDeletion { database_name: &'static str, key: BString, error: heed::Error }, #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] - StorePut { - database_name: &'static str, - key: Box<[u8]>, - value_length: usize, - error: heed::Error, - }, + StorePut { database_name: &'static str, key: BString, value_length: usize, error: heed::Error }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 219f20854..b0a61bd7f 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -10,7 +10,7 @@ use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; use flume::{RecvTimeoutError, SendError}; use heed::types::Bytes; -use heed::BytesDecode; +use heed::{BytesDecode, MdbError}; use memmap2::{Mmap, MmapMut}; use roaring::RoaringBitmap; @@ -23,7 +23,7 @@ use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{CboRoaringBitmapCodec, DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Index, InternalError}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. 
@@ -524,7 +524,14 @@ impl<'b> ExtractorBbqueueSender<'b> { } fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length: value.len(), + error: MdbError::BadValSize.into(), + } + })?; self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { key_buffer.copy_from_slice(key); value_buffer.copy_from_slice(value); @@ -587,7 +594,13 @@ impl<'b> ExtractorBbqueueSender<'b> { } fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StoreDeletion { + database_name: database.database_name(), + key: key.into(), + error: MdbError::BadValSize.into(), + } + })?; self.delete_entry_with(database, key_length, |buffer| { buffer.copy_from_slice(key); Ok(()) @@ -702,8 +715,15 @@ pub struct WordDocidsSender<'a, 'b, D> { impl WordDocidsSender<'_, '_, D> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: D::DATABASE.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; self.sender.write_key_value_with( D::DATABASE, key_length, @@ -731,7 +751,6 @@ impl FacetDocidsSender<'_, '_> { let (facet_kind, key) = FacetKind::extract_from_key(key); let database = Database::from(facet_kind); - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); let value_length = match facet_kind { // We must take the facet group size into account @@ -739,6 +758,14 @@ impl FacetDocidsSender<'_, '_> { FacetKind::Number | FacetKind::String => value_length + 1, FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, }; + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; self.sender.write_key_value_with( database, @@ -862,12 +889,20 @@ impl GeoSender<'_, '_> { } pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> { - let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let database = Database::Main; let value_length = bitmap.serialized_size(); + let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; self.0.write_key_value_with( - Database::Main, + database, key_length, value_length, |key_buffer, value_buffer| { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index bd3fedae2..7262c65cb 100644 --- 
a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -432,7 +432,7 @@ where if let Err(error) = database.put(wtxn, &key, &value) { return Err(Error::InternalError(InternalError::StorePut { database_name, - key, + key: bstr::BString::from(&key[..]), value_length: value.len(), error, })); From e9f34fb4b1d8ec674818218009055f21cb2e68e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 11:49:01 +0100 Subject: [PATCH 059/158] Make the frame consumer pulling fair --- crates/milli/src/update/new/channel.rs | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index b0a61bd7f..a2f16983e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,8 +1,10 @@ use std::cell::RefCell; use std::io::{self, BufWriter}; +use std::iter::Cycle; use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; +use std::ops::Range; use std::time::Duration; use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; @@ -64,7 +66,11 @@ pub fn extractor_writer_bbqueue( let (sender, receiver) = flume::bounded(channel_capacity); let sender = ExtractorBbqueueSender { sender, producers, capacity }; - let receiver = WriterBbqueueReceiver { receiver, consumers }; + let receiver = WriterBbqueueReceiver { + receiver, + look_at_consumer: (0..consumers.len()).cycle(), + consumers, + }; (sender, receiver) } @@ -89,6 +95,9 @@ pub struct WriterBbqueueReceiver<'a> { /// any BBQueue buffer or directly sent throught this channel /// (still written to disk). receiver: flume::Receiver, + /// Indicates the consumer to observe. This cycling range + /// ensures fair distribution of work among consumers. + look_at_consumer: Cycle>, /// The BBQueue frames to read when waking-up. consumers: Vec>, } @@ -148,16 +157,9 @@ impl<'a> WriterBbqueueReceiver<'a> { } /// Reads all the BBQueue buffers and selects the first available frame. - /// - /// Note: Selecting the first available frame gives preference to - /// frames that will be cleaned up first. It may result in the - /// last frames being more likely to fill up. One potential optimization - /// could involve keeping track of the last processed BBQueue index - /// to cycle through the frames instead of always starting from the - /// beginning. pub fn recv_frame(&mut self) -> Option> { - for consumer in &mut self.consumers { - if let Some(frame) = consumer.read() { + for index in self.look_at_consumer.by_ref().take(self.consumers.len()) { + if let Some(frame) = self.consumers[index].read() { return Some(FrameWithHeader::from(frame)); } } @@ -511,9 +513,6 @@ impl<'b> ExtractorBbqueueSender<'b> { } } - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. 
if self.sender.is_empty() { From 767259be7e5e7c8a69a802ddae9a434e349849e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 11:53:42 +0100 Subject: [PATCH 060/158] Prefer returning a abort indexation rather than throwing a panic --- crates/milli/src/update/new/channel.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index a2f16983e..b749eb7fe 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -25,7 +25,7 @@ use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{CboRoaringBitmapCodec, DocumentId, Index, InternalError}; +use crate::{CboRoaringBitmapCodec, DocumentId, Error, Index, InternalError}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. @@ -454,7 +454,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender); + let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; payload_header.serialize_into(&mut grant); // We only send a wake up message when the channel is empty @@ -500,7 +500,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender); + let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -575,7 +575,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender); + let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -629,7 +629,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender); + let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -652,21 +652,21 @@ fn reserve_grant<'b>( producer: &mut FrameProducer<'b>, total_length: usize, sender: &flume::Sender, -) -> FrameGrantW<'b> { +) -> crate::Result> { loop { for _ in 0..10_000 { match producer.grant(total_length) { Ok(mut grant) => { // We could commit only the used memory. 
grant.to_commit(total_length); - return grant; + return Ok(grant); } Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), } } if sender.is_disconnected() { - panic!("channel is disconnected"); + return Err(Error::InternalError(InternalError::AbortedIndexation)); } } } From a439fa3e1adab396074bd6387f16b081c50499ef Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 2 Dec 2024 12:02:16 +0100 Subject: [PATCH 061/158] While spamming the batches route we could see a processing batch becoming missing and then finished, this commit ensures the batches goes from processing to finished directly --- crates/index-scheduler/src/lib.rs | 9 +++++---- crates/meilisearch/tests/batches/mod.rs | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index cef24c1ea..f2510f1f9 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1738,11 +1738,8 @@ impl IndexScheduler { } } - self.processing_tasks.write().unwrap().stop_processing(); // We must re-add the canceled task so they're part of the same batch. - // processed.processing |= canceled; ids |= canceled; - self.write_batch(&mut wtxn, processing_batch, &ids)?; #[cfg(test)] @@ -1750,8 +1747,12 @@ impl IndexScheduler { wtxn.commit().map_err(Error::HeedTransaction)?; + // We should stop processing AFTER everything is processed and written to disk otherwise, a batch (which only lives in RAM) may appear in the processing task + // and then become « not found » for some time until the commit everything is written and the final commit is made. + self.processing_tasks.write().unwrap().stop_processing(); + // Once the tasks are committed, we should delete all the update files associated ASAP to avoid leaking files in case of a restart - tracing::debug!("Deleting the update files"); + // tracing::debug!("Deleting the update files"); //We take one read transaction **per thread**. 
Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap let idx = AtomicU32::new(0); diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index 799aa3df7..9c869c140 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -224,7 +224,7 @@ async fn list_batches_status_and_type_filtered() { } #[actix_rt::test] -async fn get_batch_filter_error() { +async fn list_batch_filter_error() { let server = Server::new().await; let (response, code) = server.batches_filter("lol=pied").await; From d78f4666a0ec5f645317ea07a07c324a399bd8ca Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 12:25:01 +0100 Subject: [PATCH 062/158] Fix autobatching of documents and settings --- crates/index-scheduler/src/autobatcher.rs | 82 ++--------------------- crates/index-scheduler/src/batch.rs | 61 ----------------- crates/index-scheduler/src/utils.rs | 2 +- 3 files changed, 5 insertions(+), 140 deletions(-) diff --git a/crates/index-scheduler/src/autobatcher.rs b/crates/index-scheduler/src/autobatcher.rs index 0f6aa8a3a..7ce5717f5 100644 --- a/crates/index-scheduler/src/autobatcher.rs +++ b/crates/index-scheduler/src/autobatcher.rs @@ -115,13 +115,6 @@ pub enum BatchKind { allow_index_creation: bool, settings_ids: Vec, }, - SettingsAndDocumentOperation { - settings_ids: Vec, - method: IndexDocumentsMethod, - allow_index_creation: bool, - primary_key: Option, - operation_ids: Vec, - }, Settings { allow_index_creation: bool, settings_ids: Vec, @@ -146,7 +139,6 @@ impl BatchKind { match self { BatchKind::DocumentOperation { allow_index_creation, .. } | BatchKind::ClearAndSettings { allow_index_creation, .. } - | BatchKind::SettingsAndDocumentOperation { allow_index_creation, .. } | BatchKind::Settings { allow_index_creation, .. } => Some(*allow_index_creation), _ => None, } @@ -154,10 +146,7 @@ impl BatchKind { fn primary_key(&self) -> Option> { match self { - BatchKind::DocumentOperation { primary_key, .. } - | BatchKind::SettingsAndDocumentOperation { primary_key, .. } => { - Some(primary_key.as_deref()) - } + BatchKind::DocumentOperation { primary_key, .. } => Some(primary_key.as_deref()), _ => None, } } @@ -275,8 +264,7 @@ impl BatchKind { Break(BatchKind::IndexDeletion { ids }) } ( - BatchKind::ClearAndSettings { settings_ids: mut ids, allow_index_creation: _, mut other } - | BatchKind::SettingsAndDocumentOperation { operation_ids: mut ids, method: _, allow_index_creation: _, primary_key: _, settings_ids: mut other }, + BatchKind::ClearAndSettings { settings_ids: mut ids, allow_index_creation: _, mut other }, K::IndexDeletion, ) => { ids.push(id); @@ -356,15 +344,9 @@ impl BatchKind { ) => Break(this), ( - BatchKind::DocumentOperation { method, allow_index_creation, primary_key, operation_ids }, + this @ BatchKind::DocumentOperation { .. }, K::Settings { .. 
}, - ) => Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids: vec![id], - method, - allow_index_creation, - primary_key, - operation_ids, - }), + ) => Break(this), (BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: _ }, K::DocumentClear) => { deletion_ids.push(id); @@ -477,63 +459,7 @@ impl BatchKind { allow_index_creation, }) } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: _, mut operation_ids, allow_index_creation, primary_key: _ }, - K::DocumentClear, - ) => { - operation_ids.push(id); - Continue(BatchKind::ClearAndSettings { - settings_ids, - other: operation_ids, - allow_index_creation, - }) - } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: ReplaceDocuments, mut operation_ids, allow_index_creation, primary_key: _}, - K::DocumentImport { method: ReplaceDocuments, primary_key: pk2, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method: ReplaceDocuments, - allow_index_creation, - primary_key: pk2, - operation_ids, - }) - } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: UpdateDocuments, allow_index_creation, primary_key: _, mut operation_ids }, - K::DocumentImport { method: UpdateDocuments, primary_key: pk2, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method: UpdateDocuments, - allow_index_creation, - primary_key: pk2, - operation_ids, - }) - } - // But we can't batch a settings and a doc op with another doc op - // this MUST be AFTER the two previous branch - ( - this @ BatchKind::SettingsAndDocumentOperation { .. }, - K::DocumentDeletion { .. } | K::DocumentImport { .. }, - ) => Break(this), - ( - BatchKind::SettingsAndDocumentOperation { mut settings_ids, method, allow_index_creation,primary_key, operation_ids }, - K::Settings { .. }, - ) => { - settings_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method, - allow_index_creation, - primary_key, - operation_ids, - }) - } ( BatchKind::IndexCreation { .. } | BatchKind::IndexDeletion { .. } diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 04cdb912f..5a1ed3aa7 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -441,67 +441,6 @@ impl IndexScheduler { must_create_index, })) } - BatchKind::SettingsAndDocumentOperation { - settings_ids, - method, - allow_index_creation, - primary_key, - operation_ids, - } => { - let settings = self.create_next_batch_index( - rtxn, - index_uid.clone(), - BatchKind::Settings { settings_ids, allow_index_creation }, - current_batch, - must_create_index, - )?; - - let document_import = self.create_next_batch_index( - rtxn, - index_uid.clone(), - BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids, - }, - current_batch, - must_create_index, - )?; - - match (document_import, settings) { - ( - Some(Batch::IndexOperation { - op: - IndexOperation::DocumentOperation { - primary_key, - documents_counts, - operations, - tasks: document_import_tasks, - .. - }, - .. - }), - Some(Batch::IndexOperation { - op: IndexOperation::Settings { settings, tasks: settings_tasks, .. }, - .. 
- }), - ) => Ok(Some(Batch::IndexOperation { - op: IndexOperation::SettingsAndDocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - document_import_tasks, - settings, - settings_tasks, - }, - must_create_index, - })), - _ => unreachable!(), - } - } BatchKind::IndexCreation { id } => { let mut task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; current_batch.processing(Some(&mut task)); diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 1ca782f8c..fc41d535c 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -106,7 +106,7 @@ impl ProcessingBatch { self.stats.total_nb_tasks = 0; } - /// Update the timestamp of the tasks and the inner structure of this sturcture. + /// Update the timestamp of the tasks and the inner structure of this structure. pub fn update(&mut self, task: &mut Task) { // We must re-set this value in case we're dealing with a task that has been added between // the `processing` and `finished` state From 6a1d26a60c867f1a239ffc52dc91846fc28a8b88 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 14:15:15 +0100 Subject: [PATCH 063/158] Update autobatching tests --- crates/index-scheduler/src/autobatcher.rs | 90 ++++++----------------- 1 file changed, 23 insertions(+), 67 deletions(-) diff --git a/crates/index-scheduler/src/autobatcher.rs b/crates/index-scheduler/src/autobatcher.rs index 7ce5717f5..5950e2b13 100644 --- a/crates/index-scheduler/src/autobatcher.rs +++ b/crates/index-scheduler/src/autobatcher.rs @@ -734,30 +734,30 @@ mod tests { } #[test] - fn document_addition_batch_with_settings() { + fn document_addition_doesnt_batch_with_settings() { // simple case - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); // multiple settings and doc addition - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [2, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [2, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), 
doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); // addition and setting unordered - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1, 3], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - // We ensure this kind of batch doesn't batch with forbidden operations - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_del()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_del()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_create()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_create()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, 
true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_update()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_update()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_swap()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_swap()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + // Doesn't batch with other forbidden operations + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_create()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_create()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_update()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_update()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_swap()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), 
settings(true), idx_swap()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); } #[test] @@ -785,8 +785,8 @@ mod tests { debug_snapshot!(autobatch_from(true, None, [doc_clr(), settings(true)]), @"Some((DocumentClear { ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [settings(true), doc_clr(), settings(true)]), @"Some((ClearAndSettings { other: [1], allow_index_creation: true, settings_ids: [0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr()]), @"Some((ClearAndSettings { other: [0, 2], allow_index_creation: true, settings_ids: [1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr()]), @"Some((ClearAndSettings { other: [0, 2], allow_index_creation: true, settings_ids: [1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); } #[test] @@ -833,50 +833,6 @@ mod tests { debug_snapshot!(autobatch_from(false,None, [doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); debug_snapshot!(autobatch_from(false,None, [settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); debug_snapshot!(autobatch_from(false,None, [settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - - // Then the mixed cases. - // The index already exists, whatever is the right of the tasks it shouldn't change the result. 
- debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - - // When the index doesn't exists yet it's more complicated. - // Either the first task we encounter create it, in which case we can create a big batch with everything. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - // The right of the tasks following isn't really important. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - // Or, the second case; the first task doesn't create the index and thus we wants to batch it with only tasks that can't create an index. - // that can be a second task that don't have the right to create an index. Or anything that can't create an index like an index deletion, document deletion, document clear, etc. - // All theses tasks are going to throw an error `Index doesn't exist` once the batch is processed. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - // The third and final case is when the first task doesn't create an index but is directly followed by a task creating an index. In this case we can't batch whit what - // follows because we first need to process the erronous batch. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(true), idx_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(true), idx_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(true), doc_clr(), idx_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(true), doc_clr(), idx_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); } #[test] @@ -885,13 +841,13 @@ mod tests { debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: 
ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); // batch deletion and addition From 057143214d9d846b95a96999025d7ace377f39f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 14:29:52 +0100 Subject: [PATCH 064/158] Fix warnings --- crates/index-scheduler/src/batch.rs | 75 ++--------------------------- 1 file changed, 3 insertions(+), 72 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 5a1ed3aa7..8e35ec6ac 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -104,7 +104,6 @@ pub(crate) enum IndexOperation { index_uid: String, primary_key: Option, method: IndexDocumentsMethod, - documents_counts: Vec, operations: Vec, tasks: Vec, }, @@ -130,19 +129,6 @@ pub(crate) enum IndexOperation { index_uid: String, cleared_tasks: Vec, - // The boolean indicates if it's a settings deletion or creation. - settings: Vec<(bool, Settings)>, - settings_tasks: Vec, - }, - SettingsAndDocumentOperation { - index_uid: String, - - primary_key: Option, - method: IndexDocumentsMethod, - documents_counts: Vec, - operations: Vec, - document_import_tasks: Vec, - // The boolean indicates if it's a settings deletion or creation. settings: Vec<(bool, Settings)>, settings_tasks: Vec, @@ -174,12 +160,7 @@ impl Batch { IndexOperation::DocumentEdition { task, .. } => { RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() } - IndexOperation::SettingsAndDocumentOperation { - document_import_tasks: tasks, - settings_tasks: other, - .. - } - | IndexOperation::DocumentClearAndSetting { + IndexOperation::DocumentClearAndSetting { cleared_tasks: tasks, settings_tasks: other, .. @@ -239,8 +220,7 @@ impl IndexOperation { | IndexOperation::DocumentDeletion { index_uid, .. } | IndexOperation::DocumentClear { index_uid, .. } | IndexOperation::Settings { index_uid, .. } - | IndexOperation::DocumentClearAndSetting { index_uid, .. } - | IndexOperation::SettingsAndDocumentOperation { index_uid, .. } => index_uid, + | IndexOperation::DocumentClearAndSetting { index_uid, .. } => index_uid, } } } @@ -262,9 +242,6 @@ impl fmt::Display for IndexOperation { IndexOperation::DocumentClearAndSetting { .. } => { f.write_str("IndexOperation::DocumentClearAndSetting") } - IndexOperation::SettingsAndDocumentOperation { .. } => { - f.write_str("IndexOperation::SettingsAndDocumentOperation") - } } } } @@ -330,21 +307,14 @@ impl IndexScheduler { }) .flatten(); - let mut documents_counts = Vec::new(); let mut operations = Vec::new(); for task in tasks.iter() { match task.kind { - KindWithContent::DocumentAdditionOrUpdate { - content_file, - documents_count, - .. 
- } => { - documents_counts.push(documents_count); + KindWithContent::DocumentAdditionOrUpdate { content_file, .. } => { operations.push(DocumentOperation::Add(content_file)); } KindWithContent::DocumentDeletion { ref documents_ids, .. } => { - documents_counts.push(documents_ids.len() as u64); operations.push(DocumentOperation::Delete(documents_ids.clone())); } _ => unreachable!(), @@ -356,7 +326,6 @@ impl IndexScheduler { index_uid, primary_key, method, - documents_counts, operations, tasks, }, @@ -1243,7 +1212,6 @@ impl IndexScheduler { index_uid: _, primary_key, method, - documents_counts: _, operations, mut tasks, } => { @@ -1633,43 +1601,6 @@ impl IndexScheduler { Ok(tasks) } - IndexOperation::SettingsAndDocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - document_import_tasks, - settings, - settings_tasks, - } => { - let settings_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::Settings { - index_uid: index_uid.clone(), - settings, - tasks: settings_tasks, - }, - )?; - - let mut import_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::DocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - tasks: document_import_tasks, - }, - )?; - - let mut tasks = settings_tasks; - tasks.append(&mut import_tasks); - Ok(tasks) - } IndexOperation::DocumentClearAndSetting { index_uid, cleared_tasks, From beeb31ce41e31074e1c85367684e0d78d8d008c1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 2 Dec 2024 15:32:16 +0100 Subject: [PATCH 065/158] Update crates/index-scheduler/src/lib.rs --- crates/index-scheduler/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index f2510f1f9..c719bb35e 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1752,7 +1752,7 @@ impl IndexScheduler { self.processing_tasks.write().unwrap().stop_processing(); // Once the tasks are committed, we should delete all the update files associated ASAP to avoid leaking files in case of a restart - // tracing::debug!("Deleting the update files"); + tracing::debug!("Deleting the update files"); //We take one read transaction **per thread**. Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap let idx = AtomicU32::new(0); From d040aff10124c72b4785f295163189943704fb4d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 16:30:14 +0100 Subject: [PATCH 066/158] Stop allocating 1GiB for documents --- crates/meilisearch-types/src/document_formats.rs | 2 +- crates/milli/src/update/new/indexer/document_changes.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 096349448..008be4022 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -214,7 +214,7 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? 
}; - let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB + let mut doc_alloc = Bump::with_capacity(1024 * 1024); // 1MiB let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index bfb369680..2a5c25525 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -70,7 +70,7 @@ impl< F: FnOnce(&'extractor Bump) -> Result, { let doc_alloc = - doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); + doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024)))); let doc_alloc = doc_alloc.0.take(); let fields_ids_map = fields_ids_map_store .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); From e905a72d731f0e6dc581d9b7ad02b94e594aa94e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 18:13:56 +0100 Subject: [PATCH 067/158] remove mimalloc on Windows --- crates/benchmarks/benches/indexing.rs | 1 + crates/benchmarks/benches/search_geo.rs | 1 + crates/benchmarks/benches/search_songs.rs | 1 + crates/benchmarks/benches/search_wiki.rs | 1 + crates/meilisearch/src/main.rs | 4 ++-- crates/milli/src/lib.rs | 1 + 6 files changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index d3f307be3..870e56686 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -16,6 +16,7 @@ use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; use roaring::RoaringBitmap; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/benchmarks/benches/search_geo.rs b/crates/benchmarks/benches/search_geo.rs index faea4e3e0..72503ce57 100644 --- a/crates/benchmarks/benches/search_geo.rs +++ b/crates/benchmarks/benches/search_geo.rs @@ -5,6 +5,7 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/benchmarks/benches/search_songs.rs b/crates/benchmarks/benches/search_songs.rs index a1245528f..bef014a0e 100644 --- a/crates/benchmarks/benches/search_songs.rs +++ b/crates/benchmarks/benches/search_songs.rs @@ -5,6 +5,7 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/benchmarks/benches/search_wiki.rs b/crates/benchmarks/benches/search_wiki.rs index b792c2645..24eb5c8d1 100644 --- a/crates/benchmarks/benches/search_wiki.rs +++ b/crates/benchmarks/benches/search_wiki.rs @@ -5,6 +5,7 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index c0652bf1e..b4b46bec4 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -20,14 +20,14 @@ use meilisearch::{ LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; -use mimalloc::MiMalloc; use termcolor::{Color, 
ColorChoice, ColorSpec, StandardStream, WriteColor}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt as _; use tracing_subscriber::Layer; +#[cfg(not(windows))] #[global_allocator] -static ALLOC: MiMalloc = MiMalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; fn default_log_route_layer() -> LogRouteType { None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF)) diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 48b03b6cc..1fc876f79 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(all(test, fuzzing), feature(no_coverage))] #![allow(clippy::type_complexity)] +#[cfg(not(windows))] #[cfg(test)] #[global_allocator] pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; From 71d53f413fe06273068ef17313e3f88cf7b95d81 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 3 Dec 2024 11:07:03 +0100 Subject: [PATCH 068/158] increase the margin allowed to delete task --- crates/index-scheduler/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index d6de9c758..e071c4cc0 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1440,7 +1440,7 @@ impl IndexScheduler { // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty()) - && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 50 + && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40 { return Err(Error::NoSpaceLeftInTaskQueue); } From 0ad2f57a9215f1028778d8e3668c3cb7f32709f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Dec 2024 11:35:45 +0100 Subject: [PATCH 069/158] Update bbqueue repo to point to the meilisearch org --- Cargo.lock | 2 +- crates/milli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 038b269ce..3c2fb711e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -492,7 +492,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bbqueue" version = "0.5.1" -source = "git+https://github.com/kerollmops/bbqueue#cbb87cc707b5af415ef203bdaf2443e06ba0d6d4" +source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2443e06ba0d6d4" [[package]] name = "benchmarks" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index a88401470..2a959b654 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -98,7 +98,7 @@ allocator-api2 = "0.2.18" rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" -bbqueue = { git = "https://github.com/kerollmops/bbqueue" } +bbqueue = { git = "https://github.com/meilisearch/bbqueue" } flume = { version = "0.11.1", default-features = false } [dev-dependencies] From 8ecb726683bca6a2e2c837db8c187ddbe39554f6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 3 Dec 2024 15:49:11 +0100 Subject: [PATCH 070/158] Fix the minimun BBQueue channel threshold --- crates/milli/src/update/new/indexer/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 7262c65cb..383823de1 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ 
b/crates/milli/src/update/new/indexer/mod.rs @@ -86,9 +86,9 @@ where (grenad_parameters, 2 * minimum_capacity), // 100 MiB by thread by default |max_memory| { // 2% of the indexing memory - let total_bbbuffer_capacity = (max_memory / 100 / 2).min(minimum_capacity); + let total_bbbuffer_capacity = (max_memory / 100 / 2).max(minimum_capacity); let new_grenad_parameters = GrenadParameters { - max_memory: Some(max_memory - total_bbbuffer_capacity), + max_memory: Some(max_memory.saturating_sub(total_bbbuffer_capacity)), ..grenad_parameters }; (new_grenad_parameters, total_bbbuffer_capacity) From 0459b1a2420d40282a0a259f02f0aedd57db6514 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 10:32:25 +0100 Subject: [PATCH 071/158] Change the reserve and grant function to accept a closure --- crates/milli/src/update/new/channel.rs | 71 +++++++++++++++----------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index b749eb7fe..5675069d6 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -7,7 +7,7 @@ use std::num::NonZeroU16; use std::ops::Range; use std::time::Duration; -use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; +use bbqueue::framed::{FrameGrantR, FrameProducer}; use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; use flume::{RecvTimeoutError, SendError}; @@ -454,8 +454,10 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; - payload_header.serialize_into(&mut grant); + reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + payload_header.serialize_into(grant); + Ok(()) + })?; // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. @@ -500,18 +502,20 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; + reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - - if dimensions != 0 { - let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); - for (embedding, output) in embeddings.iter().zip(output_iter) { - output.copy_from_slice(bytemuck::cast_slice(embedding)); + if dimensions != 0 { + let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } } - } + + Ok(()) + })?; // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. @@ -575,13 +579,13 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. 
- let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; - - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); - key_value_writer(key_buffer, value_buffer)?; + reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); + key_value_writer(key_buffer, value_buffer) + })?; // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. @@ -629,12 +633,12 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?; - - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - key_writer(remaining)?; + reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_writer(remaining) + })?; // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. @@ -648,18 +652,23 @@ impl<'b> ExtractorBbqueueSender<'b> { /// Try to reserve a frame grant of `total_length` by spin looping /// on the BBQueue buffer and panics if the receiver has been disconnected. -fn reserve_grant<'b>( - producer: &mut FrameProducer<'b>, +fn reserve_and_write_grant( + producer: &mut FrameProducer, total_length: usize, sender: &flume::Sender, -) -> crate::Result> { + f: F, +) -> crate::Result<()> +where + F: FnOnce(&mut [u8]) -> crate::Result<()>, +{ loop { for _ in 0..10_000 { match producer.grant(total_length) { Ok(mut grant) => { // We could commit only the used memory. - grant.to_commit(total_length); - return Ok(grant); + f(&mut grant)?; + grant.commit(total_length); + return Ok(()); } Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), From 96831ed9bb9b2784a294f32f4665f16135347f27 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 11:03:01 +0100 Subject: [PATCH 072/158] Send the WakeUp message if necessary in the reserve function --- crates/milli/src/update/new/channel.rs | 36 +++++++------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 5675069d6..ebd0ba429 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -459,12 +459,6 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) })?; - // We only send a wake up message when the channel is empty - // so that we don't fill the channel with too many WakeUps. - if self.sender.is_empty() { - self.sender.send(ReceiverAction::WakeUp).unwrap(); - } - Ok(()) } @@ -517,12 +511,6 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) })?; - // We only send a wake up message when the channel is empty - // so that we don't fill the channel with too many WakeUps. 
- if self.sender.is_empty() { - self.sender.send(ReceiverAction::WakeUp).unwrap(); - } - Ok(()) } @@ -587,12 +575,6 @@ impl<'b> ExtractorBbqueueSender<'b> { key_value_writer(key_buffer, value_buffer) })?; - // We only send a wake up message when the channel is empty - // so that we don't fill the channel with too many WakeUps. - if self.sender.is_empty() { - self.sender.send(ReceiverAction::WakeUp).unwrap(); - } - Ok(()) } @@ -640,18 +622,13 @@ impl<'b> ExtractorBbqueueSender<'b> { key_writer(remaining) })?; - // We only send a wake up message when the channel is empty - // so that we don't fill the channel with too many WakeUps. - if self.sender.is_empty() { - self.sender.send(ReceiverAction::WakeUp).unwrap(); - } - Ok(()) } } -/// Try to reserve a frame grant of `total_length` by spin looping -/// on the BBQueue buffer and panics if the receiver has been disconnected. +/// Try to reserve a frame grant of `total_length` by spin +/// looping on the BBQueue buffer, panics if the receiver +/// has been disconnected or send a WakeUp message if necessary. fn reserve_and_write_grant( producer: &mut FrameProducer, total_length: usize, @@ -668,6 +645,13 @@ where // We could commit only the used memory. f(&mut grant)?; grant.commit(total_length); + + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if sender.is_empty() { + sender.send(ReceiverAction::WakeUp).unwrap(); + } + return Ok(()); } Err(bbqueue::Error::InsufficientSize) => continue, From 953a82ca04f64a6b3db1c421fc7ab778038357ea Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 11:15:29 +0100 Subject: [PATCH 073/158] Add new error message --- crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch/src/search/mod.rs | 7 ++++++ .../meilisearch/tests/search/facet_search.rs | 22 +++++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 4b930bf8d..c68059682 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -279,6 +279,7 @@ InvalidSearchPage , InvalidRequest , BAD_REQUEST ; InvalidSearchQ , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchQuery , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ; +InvalidFacetSearchDisabled , InvalidRequest , BAD_REQUEST ; InvalidSearchVector , InvalidRequest , BAD_REQUEST ; InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 7e185e951..9e0c936b7 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -1407,6 +1407,13 @@ pub fn perform_facet_search( None => TimeBudget::default(), }; + if !index.facet_search(&rtxn)? { + return Err(ResponseError::from_msg( + "The facet search is disabled for this index".to_string(), + Code::InvalidFacetSearchDisabled, + )); + } + // In the faceted search context, we want to use the intersection between the locales provided by the user // and the locales of the facet string. // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale. 
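The hunk above is the whole behavioural change of this patch: `perform_facet_search` now rejects the request up front when facet search is disabled for the index, instead of running the search and returning an empty hit list (the behaviour the tests below previously asserted). What follows is a minimal, self-contained sketch of that early-return guard; `IndexSettings` and `SearchError` are illustrative stand-ins for this sketch only, not Meilisearch's real `Index`/`ResponseError` types.

// Illustrative stand-ins, not Meilisearch types: only the shape of the guard matters here.
#[derive(Debug, PartialEq)]
enum SearchError {
    FacetSearchDisabled,
}

struct IndexSettings {
    facet_search: bool,
}

fn perform_facet_search(
    settings: &IndexSettings,
    facet_query: &str,
) -> Result<Vec<String>, SearchError> {
    // Same pattern as the hunk above: fail with a typed error before doing
    // any facet lookup work when the feature is switched off.
    if !settings.facet_search {
        return Err(SearchError::FacetSearchDisabled);
    }
    // Placeholder for the real facet lookup.
    Ok(vec![format!("facet hits matching {facet_query:?}")])
}

fn main() {
    let settings = IndexSettings { facet_search: false };
    // This is the case the updated tests below cover: the caller gets an
    // explicit error (surfaced as a 400 Bad Request by the HTTP layer)
    // rather than an empty result set.
    assert_eq!(
        perform_facet_search(&settings, "a"),
        Err(SearchError::FacetSearchDisabled)
    );
}

Returning an explicit error instead of an empty result makes the misconfiguration visible to SDKs and API consumers, which is what the reworked snapshots in the test file below check for.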
diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 8fbeae293..418cb4da4 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -221,8 +221,15 @@ async fn add_documents_and_deactivate_facet_search() { let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; - assert_eq!(code, 200, "{}", response); - assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 0); + assert_eq!(code, 400, "{}", response); + snapshot!(response, @r###" + { + "message": "Facet search is disabled for this index", + "code": "invalid_search_disabled_facet_search", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_disabled_facet_search" + } + "###); } #[actix_rt::test] @@ -245,8 +252,15 @@ async fn deactivate_facet_search_and_add_documents() { let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; - assert_eq!(code, 200, "{}", response); - assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 0); + assert_eq!(code, 400, "{}", response); + snapshot!(response, @r###" + { + "message": "Facet search is disabled for this index", + "code": "invalid_search_disabled_facet_search", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_disabled_facet_search" + } + "###); } #[actix_rt::test] From 5ce9acb0b9eb8878b6514f38ef8641867e4f3e01 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 12:19:19 +0100 Subject: [PATCH 074/158] Add workloads --- workloads/hackernews-add-new-documents.json | 106 +++++++++++++++ .../hackernews-modify-facet-numbers.json | 111 ++++++++++++++++ .../hackernews-modify-facet-strings.json | 111 ++++++++++++++++ workloads/hackernews-modify-searchables.json | 124 ++++++++++++++++++ 4 files changed, 452 insertions(+) create mode 100644 workloads/hackernews-add-new-documents.json create mode 100644 workloads/hackernews-modify-facet-numbers.json create mode 100644 workloads/hackernews-modify-facet-strings.json create mode 100644 workloads/hackernews-modify-searchables.json diff --git a/workloads/hackernews-add-new-documents.json b/workloads/hackernews-add-new-documents.json new file mode 100644 index 000000000..38e7747c0 --- /dev/null +++ b/workloads/hackernews-add-new-documents.json @@ -0,0 +1,106 @@ +{ + "name": "hackernews.add_new_documents", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": 
"ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-05.ndjson" + }, + "synchronous": "WaitForTask" + } + ] + } + \ No newline at end of file diff --git a/workloads/hackernews-modify-facet-numbers.json b/workloads/hackernews-modify-facet-numbers.json new file mode 100644 index 000000000..84d94969b --- /dev/null +++ b/workloads/hackernews-modify-facet-numbers.json @@ -0,0 +1,111 @@ +{ + "name": "hackernews.modify_facet_numbers", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-02-modified-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", + "sha256": "1fcb6f89ddeff51c3fe7b86b3574f894ff9859a76cf056ab7e7dacc72970dabb" + } + }, + "precommands": [ + { + 
"route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01-modified-filters.ndjson" + }, + "synchronous": "WaitForTask" + } + ] + } + \ No newline at end of file diff --git a/workloads/hackernews-modify-facet-strings.json b/workloads/hackernews-modify-facet-strings.json new file mode 100644 index 000000000..f912558e8 --- /dev/null +++ b/workloads/hackernews-modify-facet-strings.json @@ -0,0 +1,111 @@ +{ + "name": "hackernews.modify_facet_strings", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", + "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + 
"synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01-modified-filters.ndjson" + }, + "synchronous": "WaitForTask" + } + ] + } + \ No newline at end of file diff --git a/workloads/hackernews-modify-searchables.json b/workloads/hackernews-modify-searchables.json new file mode 100644 index 000000000..0f674ece0 --- /dev/null +++ b/workloads/hackernews-modify-searchables.json @@ -0,0 +1,124 @@ +{ + "name": "hackernews.modify_searchables", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-searchables.ndjson", + "sha256": "e5c08710c6af70031ac7212e0ba242c72ef29c8d4e1fce66c789544641452a7c" + }, + "hackernews-02-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-searchables.ndjson", + "sha256": "098b029851117087b1e26ccb7ac408eda9bba54c3008213a2880d6fab607346e" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + 
"route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01-modified-searchables.ndjson" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02-modified-searchables.ndjson" + }, + "synchronous": "WaitForTask" + } + ] + } + \ No newline at end of file From 1a17e2e5727b9f98685176f6b14984655c245c9f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 13:57:06 +0100 Subject: [PATCH 075/158] fix formating --- workloads/hackernews-add-new-documents.json | 189 ++++++++------- .../hackernews-modify-facet-numbers.json | 200 ++++++++-------- .../hackernews-modify-facet-strings.json | 202 ++++++++-------- workloads/hackernews-modify-searchables.json | 219 +++++++++--------- 4 files changed, 404 insertions(+), 406 deletions(-) diff --git a/workloads/hackernews-add-new-documents.json b/workloads/hackernews-add-new-documents.json index 38e7747c0..0470a0792 100644 --- a/workloads/hackernews-add-new-documents.json +++ b/workloads/hackernews-add-new-documents.json @@ -1,106 +1,105 @@ { - "name": "hackernews.add_new_documents", - "run_count": 3, - "extra_cli_args": [], - "assets": { - "hackernews-01.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", - "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" - }, - "hackernews-02.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", - "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" - }, - "hackernews-03.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", - "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" - }, - "hackernews-04.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", - "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" - }, - "hackernews-05.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", - "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + "name": "hackernews.add_new_documents", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + 
"local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] } + }, + "synchronous": "WaitForTask" }, - "precommands": [ - { - "route": "indexes/movies/settings", - "method": "PATCH", - "body": { - "inline": { - "displayedAttributes": [ - "title", - "by", - "score", - "time", - "text" - ], - "searchableAttributes": [ - "title", - "text" - ], - "filterableAttributes": [ - "by", - "kids", - "parent" - ], - "sortableAttributes": [ - "score", - "time" - ] - } - }, - "synchronous": "WaitForTask" + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", "body": { - "asset": "hackernews-01.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-02.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-03.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-04.ndjson" + "asset": "hackernews-05.ndjson" }, "synchronous": "WaitForTask" } - ], - "commands": [ - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-05.ndjson" - }, - "synchronous": "WaitForTask" - } - ] - } - \ No newline at end of file + ] +} diff --git a/workloads/hackernews-modify-facet-numbers.json b/workloads/hackernews-modify-facet-numbers.json index 84d94969b..c0726aedd 100644 --- a/workloads/hackernews-modify-facet-numbers.json +++ b/workloads/hackernews-modify-facet-numbers.json @@ -1,111 
+1,111 @@ { - "name": "hackernews.modify_facet_numbers", - "run_count": 3, - "extra_cli_args": [], - "assets": { - "hackernews-01.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", - "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" - }, - "hackernews-02.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", - "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" - }, - "hackernews-03.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", - "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" - }, - "hackernews-04.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", - "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" - }, - "hackernews-05.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", - "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" - }, - "hackernews-02-modified-filters.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", - "sha256": "1fcb6f89ddeff51c3fe7b86b3574f894ff9859a76cf056ab7e7dacc72970dabb" - } + "name": "hackernews.modify_facet_numbers", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" }, - "precommands": [ - { - "route": "indexes/movies/settings", - "method": "PATCH", - "body": { - "inline": { - "displayedAttributes": [ - "title", - "by", - "score", - "time", - "text" - ], - "searchableAttributes": [ - "title", - "text" - ], - "filterableAttributes": [ - "by", - "kids", - "parent" - ], - "sortableAttributes": [ - "score", - "time" - ] - } - }, - "synchronous": "WaitForTask" + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + 
"hackernews-02-modified-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", + "sha256": "1fcb6f89ddeff51c3fe7b86b3574f894ff9859a76cf056ab7e7dacc72970dabb" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", "body": { - "asset": "hackernews-01.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-02.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-03.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-04.ndjson" + "asset": "hackernews-01-modified-filters.ndjson" }, "synchronous": "WaitForTask" } - ], - "commands": [ - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-01-modified-filters.ndjson" - }, - "synchronous": "WaitForTask" - } - ] - } + ] +} \ No newline at end of file diff --git a/workloads/hackernews-modify-facet-strings.json b/workloads/hackernews-modify-facet-strings.json index f912558e8..7c5eb2e70 100644 --- a/workloads/hackernews-modify-facet-strings.json +++ b/workloads/hackernews-modify-facet-strings.json @@ -1,111 +1,111 @@ { - "name": "hackernews.modify_facet_strings", - "run_count": 3, - "extra_cli_args": [], - "assets": { - "hackernews-01.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", - "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" - }, - "hackernews-02.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", - "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" - }, - "hackernews-03.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", - "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" - }, - "hackernews-04.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", - 
"sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" - }, - "hackernews-05.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", - "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" - }, - "hackernews-01-modified-filters.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", - "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" - } + "name": "hackernews.modify_facet_strings", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" }, - "precommands": [ - { - "route": "indexes/movies/settings", - "method": "PATCH", - "body": { - "inline": { - "displayedAttributes": [ - "title", - "by", - "score", - "time", - "text" - ], - "searchableAttributes": [ - "title", - "text" - ], - "filterableAttributes": [ - "by", - "kids", - "parent" - ], - "sortableAttributes": [ - "score", - "time" - ] - } - }, - "synchronous": "WaitForTask" + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", + "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": 
"hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { "route": "indexes/movies/documents", "method": "POST", "body": { - "asset": "hackernews-01.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-02.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-03.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-04.ndjson" + "asset": "hackernews-01-modified-filters.ndjson" }, "synchronous": "WaitForTask" } - ], - "commands": [ - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-01-modified-filters.ndjson" - }, - "synchronous": "WaitForTask" - } - ] - } - \ No newline at end of file + ] +} + \ No newline at end of file diff --git a/workloads/hackernews-modify-searchables.json b/workloads/hackernews-modify-searchables.json index 0f674ece0..248026f19 100644 --- a/workloads/hackernews-modify-searchables.json +++ b/workloads/hackernews-modify-searchables.json @@ -1,71 +1,113 @@ { - "name": "hackernews.modify_searchables", - "run_count": 3, - "extra_cli_args": [], - "assets": { - "hackernews-01.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", - "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" - }, - "hackernews-02.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", - "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" - }, - "hackernews-03.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", - "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" - }, - "hackernews-04.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", - "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" - }, - "hackernews-05.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", - "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" - }, - "hackernews-01-modified-searchables.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-searchables.ndjson", - "sha256": "e5c08710c6af70031ac7212e0ba242c72ef29c8d4e1fce66c789544641452a7c" - }, - "hackernews-02-modified-searchables.ndjson": { - "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-searchables.ndjson", - "sha256": "098b029851117087b1e26ccb7ac408eda9bba54c3008213a2880d6fab607346e" - } + "name": "hackernews.modify_searchables", + "run_count": 3, + "extra_cli_args": [], + "assets": { + 
"hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" }, - "precommands": [ + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-searchables.ndjson", + "sha256": "e5c08710c6af70031ac7212e0ba242c72ef29c8d4e1fce66c789544641452a7c" + }, + "hackernews-02-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-searchables.ndjson", + "sha256": "098b029851117087b1e26ccb7ac408eda9bba54c3008213a2880d6fab607346e" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ { - "route": "indexes/movies/settings", - "method": "PATCH", + "route": "indexes/movies/documents", + "method": "POST", "body": { - "inline": { - "displayedAttributes": [ - "title", - "by", - "score", - "time", - "text" - ], - "searchableAttributes": [ - "title", - "text" - ], - "filterableAttributes": [ - "by", - "kids", - "parent" - ], - "sortableAttributes": [ - "score", - "time" - ] - } + "asset": "hackernews-01-modified-searchables.ndjson" }, "synchronous": "WaitForTask" }, @@ -73,52 +115,9 @@ "route": "indexes/movies/documents", "method": "POST", "body": { - 
"asset": "hackernews-01.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-02.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-03.ndjson" - }, - "synchronous": "WaitForResponse" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-04.ndjson" + "asset": "hackernews-02-modified-searchables.ndjson" }, "synchronous": "WaitForTask" } - ], - "commands": [ - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-01-modified-searchables.ndjson" - }, - "synchronous": "WaitForTask" - }, - { - "route": "indexes/movies/documents", - "method": "POST", - "body": { - "asset": "hackernews-02-modified-searchables.ndjson" - }, - "synchronous": "WaitForTask" - } - ] - } - \ No newline at end of file + ] +} From 261d2ceb06553465115419afed01dc4b0cbbf848 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 14:16:40 +0100 Subject: [PATCH 076/158] Yield the BBQueue writer instead of spin looping --- crates/milli/src/update/new/channel.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index ebd0ba429..7590c02ac 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -661,6 +661,11 @@ where if sender.is_disconnected() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } + + // We prefer to yield and allow the writing thread + // to do its job, especially beneficial when there + // is only one CPU core available. + std::thread::yield_now(); } } From fc1df5793cb5fa8d462081f0e4f1dad511a8746a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 14:35:20 +0100 Subject: [PATCH 077/158] fix tests --- crates/meilisearch/tests/search/facet_search.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 418cb4da4..23f312490 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -224,10 +224,10 @@ async fn add_documents_and_deactivate_facet_search() { assert_eq!(code, 400, "{}", response); snapshot!(response, @r###" { - "message": "Facet search is disabled for this index", - "code": "invalid_search_disabled_facet_search", + "message": "The facet search is disabled for this index", + "code": "invalid_facet_search_disabled", "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_disabled_facet_search" + "link": "https://docs.meilisearch.com/errors#invalid_facet_search_disabled" } "###); } @@ -255,10 +255,10 @@ async fn deactivate_facet_search_and_add_documents() { assert_eq!(code, 400, "{}", response); snapshot!(response, @r###" { - "message": "Facet search is disabled for this index", - "code": "invalid_search_disabled_facet_search", + "message": "The facet search is disabled for this index", + "code": "invalid_facet_search_disabled", "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_disabled_facet_search" + "link": "https://docs.meilisearch.com/errors#invalid_facet_search_disabled" } "###); } From 7458f0386c2259add91977f653c3d741d6809e08 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 
14:44:57 +0100 Subject: [PATCH 078/158] fix asset name --- workloads/hackernews-modify-facet-numbers.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workloads/hackernews-modify-facet-numbers.json b/workloads/hackernews-modify-facet-numbers.json index c0726aedd..59ade0561 100644 --- a/workloads/hackernews-modify-facet-numbers.json +++ b/workloads/hackernews-modify-facet-numbers.json @@ -102,7 +102,7 @@ "route": "indexes/movies/documents", "method": "POST", "body": { - "asset": "hackernews-01-modified-filters.ndjson" + "asset": "hackernews-02-modified-filters.ndjson" }, "synchronous": "WaitForTask" } From bf742d81cfb66c2e98e5b26b046d8708421574c2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 4 Dec 2024 14:47:02 +0100 Subject: [PATCH 079/158] add a test --- crates/index-scheduler/src/lib.rs | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index d6de9c758..5e0e4f97a 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4319,10 +4319,35 @@ mod tests { let proc = index_scheduler.processing_tasks.read().unwrap().clone(); let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) + let (mut batches, _) = index_scheduler + .get_batches_from_authorized_indexes(query.clone(), &AuthFilter::default()) .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,]"); // only the processing batch in the first tick + assert_eq!(batches.len(), 1); + batches[0].started_at = OffsetDateTime::UNIX_EPOCH; + // Insta cannot snapshot our batches because the batch stats contains an enum as key: https://github.com/mitsuhiko/insta/issues/689 + let batch = serde_json::to_string_pretty(&batches[0]).unwrap(); + snapshot!(batch, @r#" + { + "uid": 0, + "details": { + "primaryKey": "mouse" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "enqueued": 2 + }, + "types": { + "indexCreation": 2 + }, + "indexUids": { + "catto": 2 + } + }, + "startedAt": "1970-01-01T00:00:00Z", + "finishedAt": null + } + "#); let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; let (batches, _) = index_scheduler From cbcf6c9ba371614de222d03344cba7ab84ed7ab4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 4 Dec 2024 14:48:48 +0100 Subject: [PATCH 080/158] make the processing tasks as processing in a batch --- crates/index-scheduler/src/lib.rs | 2 +- crates/index-scheduler/src/utils.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 5e0e4f97a..2d953fc6e 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4335,7 +4335,7 @@ mod tests { "stats": { "totalNbTasks": 2, "status": { - "enqueued": 2 + "processing": 2 }, "types": { "indexCreation": 2 diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index fc41d535c..356d77b35 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -67,7 +67,7 @@ impl ProcessingBatch { task.batch_uid = Some(self.uid); // We don't store the statuses in the map since they're all enqueued but we must // still store them in the stats since that can be displayed. 
- *self.stats.status.entry(task.status).or_default() += 1; + *self.stats.status.entry(Status::Processing).or_default() += 1; self.kinds.insert(task.kind.as_kind()); *self.stats.types.entry(task.kind.as_kind()).or_default() += 1; From 8388698993050ed95c7ba9411590d7ec052c11b8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 15:09:10 +0100 Subject: [PATCH 081/158] Fix dat hash --- workloads/hackernews-modify-facet-numbers.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workloads/hackernews-modify-facet-numbers.json b/workloads/hackernews-modify-facet-numbers.json index 59ade0561..f4171442f 100644 --- a/workloads/hackernews-modify-facet-numbers.json +++ b/workloads/hackernews-modify-facet-numbers.json @@ -31,7 +31,7 @@ "hackernews-02-modified-filters.ndjson": { "local_location": null, "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", - "sha256": "1fcb6f89ddeff51c3fe7b86b3574f894ff9859a76cf056ab7e7dacc72970dabb" + "sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802" } }, "precommands": [ From cb0c3a5aad0f3fa2ffcfe51a5a59480f8d3049ee Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 4 Dec 2024 15:43:05 +0100 Subject: [PATCH 082/158] stop adding one enqueued tasks to all unprioritized batches --- crates/index-scheduler/src/batch.rs | 3 +-- crates/index-scheduler/src/lib.rs | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index ce86c10ca..fc6fb194c 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -496,8 +496,7 @@ impl IndexScheduler { // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; - let mut task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); + let task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; // If the task is not associated with any index, verify that it is an index swap and // create the batch directly. 
Otherwise, get the index name associated with the task diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 2d953fc6e..9715e9e2f 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4333,15 +4333,15 @@ mod tests { "primaryKey": "mouse" }, "stats": { - "totalNbTasks": 2, + "totalNbTasks": 1, "status": { - "processing": 2 + "processing": 1 }, "types": { - "indexCreation": 2 + "indexCreation": 1 }, "indexUids": { - "catto": 2 + "catto": 1 } }, "startedAt": "1970-01-01T00:00:00Z", From 7a2af06b1ec31b8b1dbd7918ecd3b655b0c31aa6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 4 Dec 2024 15:52:24 +0100 Subject: [PATCH 083/158] update the impacted snapshots --- .../lib.rs/cancel_mix_of_tasks/aborted_indexation.snap | 2 +- .../processing_second_task_cancel_enqueued.snap | 2 +- .../lib.rs/cancel_processing_dump/cancel_registered.snap | 2 +- .../lib.rs/cancel_processing_task/aborted_indexation.snap | 2 +- .../lib.rs/cancel_processing_task/cancel_task_registered.snap | 2 +- .../lib.rs/cancel_processing_task/initial_task_processing.snap | 2 +- .../lib.rs/document_addition/after_the_batch_creation.snap | 2 +- .../document_addition_batch_created.snap | 2 +- .../after_batch_succeeded.snap | 2 +- .../after_failing_to_commit.snap | 2 +- .../after_batch_creation.snap | 2 +- .../registered_the_second_task.snap | 2 +- .../registered_the_third_task.snap | 2 +- .../lib.rs/query_batches_simple/after-advancing-a-bit.snap | 2 +- .../lib.rs/swap_indexes/third_empty_swap_processed.snap | 3 +-- .../task_deletion_undeleteable/task_deletion_processing.snap | 2 +- 16 files changed, 16 insertions(+), 17 deletions(-) diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap index 9710c4911..b73714e36 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, } +{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap index e70aa0850..c24c36313 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch 
Some(1): [1,] -{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, } +{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap index 55c7b3ed2..b9f33e598 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"dumpCreation":1},"indexUids":{}}, } +{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"dumpCreation":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap index 91b4deb22..0b9a0d709 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap index 89e8c8c6f..fef6c20f6 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true 
### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap index 12e1b1283..3f45be007 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap b/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap index f7eaa6df8..8beb49145 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap 
b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap index f7eaa6df8..8beb49145 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap index 0091af65b..8ab4d84dd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap index 0091af65b..8ab4d84dd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: 
{"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap index aafef2fce..9d3f29c48 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap index 86fea2386..322bcf4ab 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap index ea910f491..aa047e3ff 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap +++ 
b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap b/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap index 869e38e57..bf5d0528c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"doggo":2}}, } +{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap b/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap index 77b1193a5..0f126b33a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap @@ -89,7 +89,7 @@ succeeded [0,1,2,3,4,5,6,] ---------------------------------------------------------------------- ### Batches Kind: "indexCreation" [0,1,2,3,] -"indexSwap" [4,5,6,] +"indexSwap" [4,5,] ---------------------------------------------------------------------- ### Batches Index Tasks: a [0,4,5,] @@ -104,7 +104,6 @@ d [3,4,] [timestamp] [3,] [timestamp] [4,] [timestamp] [5,] -[timestamp] [6,] ---------------------------------------------------------------------- ### Batches Started At: [timestamp] [0,] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap b/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap index fce223c6c..85a0afc46 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [3,] -{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"taskDeletion":1},"indexUids":{}}, } +{uid: 0, details: 
{"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"taskDeletion":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} From 739c52a3cdc420f929e45ce6189f18d624dc904f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Dec 2024 16:16:48 +0100 Subject: [PATCH 084/158] Replace HashSets by BTreeSets for the prefixes --- .../milli/src/update/new/word_fst_builder.rs | 12 +++---- .../src/update/new/words_prefix_docids.rs | 36 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crates/milli/src/update/new/word_fst_builder.rs b/crates/milli/src/update/new/word_fst_builder.rs index 6bc72d91d..a9a5222be 100644 --- a/crates/milli/src/update/new/word_fst_builder.rs +++ b/crates/milli/src/update/new/word_fst_builder.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::io::BufWriter; use fst::{Set, SetBuilder, Streamer}; @@ -75,8 +75,8 @@ pub struct PrefixData { #[derive(Debug)] pub struct PrefixDelta { - pub modified: HashSet, - pub deleted: HashSet, + pub modified: BTreeSet, + pub deleted: BTreeSet, } struct PrefixFstBuilder { @@ -86,7 +86,7 @@ struct PrefixFstBuilder { prefix_fst_builders: Vec>>, current_prefix: Vec, current_prefix_count: Vec, - modified_prefixes: HashSet, + modified_prefixes: BTreeSet, current_prefix_is_modified: Vec, } @@ -110,7 +110,7 @@ impl PrefixFstBuilder { prefix_fst_builders, current_prefix: vec![Prefix::new(); max_prefix_length], current_prefix_count: vec![0; max_prefix_length], - modified_prefixes: HashSet::new(), + modified_prefixes: BTreeSet::new(), current_prefix_is_modified: vec![false; max_prefix_length], }) } @@ -180,7 +180,7 @@ impl PrefixFstBuilder { let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? }; let new_prefix_fst = Set::new(&prefix_fst_mmap)?; let old_prefix_fst = index.words_prefixes_fst(rtxn)?; - let mut deleted_prefixes = HashSet::new(); + let mut deleted_prefixes = BTreeSet::new(); { let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference(); while let Some(prefix) = deleted_prefixes_stream.next() { diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index 7e56beeae..bf64049c3 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::io::{BufReader, BufWriter, Read, Seek, Write}; use hashbrown::HashMap; @@ -37,8 +37,8 @@ impl WordPrefixDocids { fn execute( self, wtxn: &mut heed::RwTxn, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, ) -> Result<()> { delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) @@ -48,7 +48,7 @@ impl WordPrefixDocids { fn recompute_modified_prefixes( &self, wtxn: &mut RwTxn, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. // And collect the CboRoaringBitmaps pointers in an HashMap. 
@@ -127,7 +127,7 @@ impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> { pub fn from_prefixes( database: Database, rtxn: &'rtxn RoTxn, - prefixes: &'a HashSet, + prefixes: &'a BTreeSet, ) -> heed::Result { let database = database.remap_data_type::(); @@ -173,8 +173,8 @@ impl WordPrefixIntegerDocids { fn execute( self, wtxn: &mut heed::RwTxn, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, ) -> Result<()> { delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) @@ -184,7 +184,7 @@ impl WordPrefixIntegerDocids { fn recompute_modified_prefixes( &self, wtxn: &mut RwTxn, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. // And collect the CboRoaringBitmaps pointers in an HashMap. @@ -262,7 +262,7 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { pub fn from_prefixes( database: Database, rtxn: &'rtxn RoTxn, - prefixes: &'a HashSet, + prefixes: &'a BTreeSet, ) -> heed::Result { let database = database.remap_data_type::(); @@ -291,7 +291,7 @@ unsafe impl<'a, 'rtxn> Sync for FrozenPrefixIntegerBitmaps<'a, 'rtxn> {} fn delete_prefixes( wtxn: &mut RwTxn, prefix_database: &Database, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We remove all the entries that are no more required in this word prefix docids database. for prefix in prefixes { @@ -309,8 +309,8 @@ fn delete_prefixes( pub fn compute_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixDocids::new( @@ -325,8 +325,8 @@ pub fn compute_word_prefix_docids( pub fn compute_exact_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixDocids::new( @@ -341,8 +341,8 @@ pub fn compute_exact_word_prefix_docids( pub fn compute_word_prefix_fid_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixIntegerDocids::new( @@ -357,8 +357,8 @@ pub fn compute_word_prefix_fid_docids( pub fn compute_word_prefix_position_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixIntegerDocids::new( From 29ef1645305b5b1f1d37011fec05f7c2b8ca66f7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 16:33:35 +0100 Subject: [PATCH 085/158] Introduce a new semi ordered merge function --- crates/milli/src/update/new/extract/cache.rs | 110 +++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index be077d142..ae5ade17e 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -566,6 +566,116 @@ where Ok(()) } +/// Merges the caches that must be all associated to the same bucket. 
+///
+/// It merges entries like the `merge_caches` function
+pub fn merge_caches_alt<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
+where
+    F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
+{
+    let mut maps = Vec::new();
+    let mut readers = Vec::new();
+    let mut current_bucket = None;
+    for FrozenCache { bucket, cache, ref mut spilled } in frozen {
+        assert_eq!(*current_bucket.get_or_insert(bucket), bucket);
+        maps.push(cache);
+        readers.append(spilled);
+    }
+
+    // First manage the spilled entries by looking into the HashMaps,
+    // merge them and mark them as dummy.
+    let mut heap = BinaryHeap::new();
+    for (source_index, source) in readers.into_iter().enumerate() {
+        let mut cursor = source.into_cursor()?;
+        if cursor.move_on_next()?.is_some() {
+            heap.push(Entry { cursor, source_index });
+        }
+    }
+
+    loop {
+        let mut first_entry = match heap.pop() {
+            Some(entry) => entry,
+            None => break,
+        };
+
+        let (first_key, first_value) = match first_entry.cursor.current() {
+            Some((key, value)) => (key, value),
+            None => break,
+        };
+
+        let mut output = DelAddRoaringBitmap::from_bytes(first_value)?;
+        while let Some(mut entry) = heap.peek_mut() {
+            if let Some((key, value)) = entry.cursor.current() {
+                if first_key == key {
+                    let new = DelAddRoaringBitmap::from_bytes(value)?;
+                    output = output.merge(new);
+                    // When we are done with the current value of this entry we make
+                    // it move forward and let the heap reorganize itself (on drop)
+                    if entry.cursor.move_on_next()?.is_none() {
+                        PeekMut::pop(entry);
+                    }
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Once we merged all of the spilled bitmaps we must also
+        // fetch the entries from the non-spilled entries (the HashMaps).
+        for (map_index, map) in maps.iter_mut().enumerate() {
+            if first_entry.source_index != map_index {
+                if let Some(new) = map.get_mut(first_key) {
+                    output.union_and_clear_bbbul(new);
+                }
+            }
+        }
+
+        // We send the merged entry outside.
+        (f)(first_key, output)?;
+
+        // Don't forget to put the first entry back into the heap.
+        if first_entry.cursor.move_on_next()?.is_some() {
+            heap.push(first_entry)
+        }
+    }
+
+    // Then manage the content on the HashMap entries that weren't taken (mem::take).
+    let order_count = 1000;
+    while let Some(mut map) = maps.pop() {
+        let mut iter = map.iter_mut();
+
+        loop {
+            let mut ordered_buffer: Vec<_> = iter.by_ref().take(order_count).collect();
+            ordered_buffer.sort_unstable_by_key(|(key, _)| *key);
+
+            if ordered_buffer.is_empty() {
+                break;
+            }
+
+            for (key, bbbul) in ordered_buffer.drain(..) {
+                // Make sure we don't try to work with entries already managed by the spilled
+                if bbbul.is_empty() {
+                    continue;
+                }
+
+                let mut output = DelAddRoaringBitmap::empty();
+                output.union_and_clear_bbbul(bbbul);
+
+                for rhs in maps.iter_mut() {
+                    if let Some(new) = rhs.get_mut(key) {
+                        output.union_and_clear_bbbul(new);
+                    }
+                }
+
+                // We send the merged entry outside.
+ (f)(key, output)?; + } + } + } + + Ok(()) +} + struct Entry { cursor: ReaderCursor, source_index: usize, From be411435f5248531f9b5b7891016e5e7304d5a83 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 16:37:29 +0100 Subject: [PATCH 086/158] Use the merge_caches_alt function in the docids merging --- crates/milli/src/update/new/extract/mod.rs | 5 ++++- crates/milli/src/update/new/merger.rs | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index e67f70db1..3601dd9c6 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -6,7 +6,10 @@ mod searchable; mod vectors; use bumpalo::Bump; -pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; +pub use cache::{ + merge_caches, merge_caches_alt, transpose_and_freeze_caches, BalancedCaches, + DelAddRoaringBitmap, +}; pub use documents::*; pub use faceted::*; pub use geo::*; diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index b650b6b53..9f2aae5a8 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -9,8 +9,8 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::{ - merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, - GeoExtractorData, + merge_caches, merge_caches_alt, transpose_and_freeze_caches, BalancedCaches, + DelAddRoaringBitmap, FacetKind, GeoExtractorData, }; use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result}; @@ -78,7 +78,7 @@ where if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_alt(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { From cb99ac6f7eddef97bb4386987b3151ecd40219f4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 17:00:22 +0100 Subject: [PATCH 087/158] Consume vec instead of draining --- crates/milli/src/update/new/extract/cache.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index ae5ade17e..b57ba6b9b 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -652,7 +652,7 @@ where break; } - for (key, bbbul) in ordered_buffer.drain(..) { + for (key, bbbul) in ordered_buffer { // Make sure we don't try to work with entries already managed by the spilled if bbbul.is_empty() { continue; From 2e32d0474ccc846bbe86c0bbafd88368f82e8a3e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 17:05:07 +0100 Subject: [PATCH 088/158] Lexicographically sort all the map to merge --- crates/milli/src/update/new/extract/cache.rs | 38 +++++++------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index b57ba6b9b..325a72280 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -640,36 +640,24 @@ where } // Then manage the content on the HashMap entries that weren't taken (mem::take). 
- let order_count = 1000; while let Some(mut map) = maps.pop() { - let mut iter = map.iter_mut(); + // Make sure we don't try to work with entries already managed by the spilled + let mut ordered_entries: Vec<_> = + map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect(); + ordered_entries.sort_unstable_by_key(|(key, _)| *key); - loop { - let mut ordered_buffer: Vec<_> = iter.by_ref().take(order_count).collect(); - ordered_buffer.sort_unstable_by_key(|(key, _)| *key); + for (key, bbbul) in ordered_entries { + let mut output = DelAddRoaringBitmap::empty(); + output.union_and_clear_bbbul(bbbul); - if ordered_buffer.is_empty() { - break; + for rhs in maps.iter_mut() { + if let Some(new) = rhs.get_mut(key) { + output.union_and_clear_bbbul(new); + } } - for (key, bbbul) in ordered_buffer { - // Make sure we don't try to work with entries already managed by the spilled - if bbbul.is_empty() { - continue; - } - - let mut output = DelAddRoaringBitmap::empty(); - output.union_and_clear_bbbul(bbbul); - - for rhs in maps.iter_mut() { - if let Some(new) = rhs.get_mut(key) { - output.union_and_clear_bbbul(new); - } - } - - // We send the merged entry outside. - (f)(key, output)?; - } + // We send the merged entry outside. + (f)(key, output)?; } } From 2da5584bb555a564c382774bd4ad03ae39184ddb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 17:39:07 +0100 Subject: [PATCH 089/158] Make the tasks pulling timeout configurable --- crates/xtask/src/bench/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/xtask/src/bench/mod.rs b/crates/xtask/src/bench/mod.rs index fdb2c4963..891742528 100644 --- a/crates/xtask/src/bench/mod.rs +++ b/crates/xtask/src/bench/mod.rs @@ -82,6 +82,10 @@ pub struct BenchDeriveArgs { /// Reason for the benchmark invocation #[arg(short, long)] reason: Option, + + /// The maximum time in seconds we allow for fetching the task queue before timing out. + #[arg(long, default_value_t = 60)] + tasks_queue_timeout_secs: u64, } pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { @@ -127,7 +131,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let meili_client = Client::new( Some("http://127.0.0.1:7700".into()), args.master_key.as_deref(), - Some(std::time::Duration::from_secs(60)), + Some(std::time::Duration::from_secs(args.tasks_queue_timeout_secs)), )?; // enter runtime From d0c4e6da6bceb7d079c1a29ac5d95d796a63810c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Dec 2024 14:32:45 +0100 Subject: [PATCH 090/158] Make clippy happy --- crates/xtask/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/xtask/src/main.rs b/crates/xtask/src/main.rs index b81424666..942362f4f 100644 --- a/crates/xtask/src/main.rs +++ b/crates/xtask/src/main.rs @@ -16,6 +16,7 @@ struct ListFeaturesDeriveArgs { #[command(author, version, about, long_about)] #[command(name = "cargo xtask")] #[command(bin_name = "cargo xtask")] +#[allow(clippy::large_enum_variant)] // please, that's enough... 
enum Command { ListFeatures(ListFeaturesDeriveArgs), Bench(BenchDeriveArgs), From 5f896b1050ebef939ab68b8ba569193278d61ebb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 4 Dec 2024 17:51:12 +0100 Subject: [PATCH 091/158] Fix geo when spilling --- .../milli/src/update/new/extract/geo/mod.rs | 28 +++++++++++-------- crates/milli/src/update/new/merger.rs | 4 +-- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 09d2ce0f8..a3820609d 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -1,6 +1,6 @@ use std::cell::RefCell; use std::fs::File; -use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; +use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _}; use std::{iter, mem, result}; use bumpalo::Bump; @@ -97,30 +97,34 @@ pub struct FrozenGeoExtractorData<'extractor> { impl<'extractor> FrozenGeoExtractorData<'extractor> { pub fn iter_and_clear_removed( &mut self, - ) -> impl IntoIterator> + '_ { - mem::take(&mut self.removed) + ) -> io::Result> + '_> { + Ok(mem::take(&mut self.removed) .iter() .copied() .map(Ok) - .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)?)) } pub fn iter_and_clear_inserted( &mut self, - ) -> impl IntoIterator> + '_ { - mem::take(&mut self.inserted) + ) -> io::Result> + '_> { + Ok(mem::take(&mut self.inserted) .iter() .copied() .map(Ok) - .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)?)) } } fn iterator_over_spilled_geopoints( spilled: &mut Option>, -) -> impl IntoIterator> + '_ { +) -> io::Result> + '_> { let mut spilled = spilled.take(); - iter::from_fn(move || match &mut spilled { + if let Some(spilled) = &mut spilled { + spilled.rewind()?; + } + + Ok(iter::from_fn(move || match &mut spilled { Some(file) => { let geopoint_bytes = &mut [0u8; mem::size_of::()]; match file.read_exact(geopoint_bytes) { @@ -130,7 +134,7 @@ fn iterator_over_spilled_geopoints( } } None => None, - }) + })) } impl<'extractor> Extractor<'extractor> for GeoExtractor { @@ -157,7 +161,9 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { let mut data_ref = context.data.borrow_mut_or_yield(); for change in changes { - if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) { + if data_ref.spilled_removed.is_none() + && max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) + { // We must spill as we allocated too much memory data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?; data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?; diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index b650b6b53..512e094fb 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -34,7 +34,7 @@ where } let mut frozen = data.into_inner().freeze()?; - for result in frozen.iter_and_clear_removed() { + for result in frozen.iter_and_clear_removed()? { let extracted_geo_point = result?; let removed = rtree.remove(&GeoPoint::from(extracted_geo_point)); debug_assert!(removed.is_some()); @@ -42,7 +42,7 @@ where debug_assert!(removed); } - for result in frozen.iter_and_clear_inserted() { + for result in frozen.iter_and_clear_inserted()? 
{ let extracted_geo_point = result?; rtree.insert(GeoPoint::from(extracted_geo_point)); let inserted = faceted.insert(extracted_geo_point.docid); From 3a11e39c010d474129e1c4816c61d9f96bdead00 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 4 Dec 2024 17:52:53 +0100 Subject: [PATCH 092/158] Force max_memory to a min of 100MiB --- crates/milli/src/update/new/indexer/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 383823de1..9ee7577a5 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -88,7 +88,9 @@ where // 2% of the indexing memory let total_bbbuffer_capacity = (max_memory / 100 / 2).max(minimum_capacity); let new_grenad_parameters = GrenadParameters { - max_memory: Some(max_memory.saturating_sub(total_bbbuffer_capacity)), + max_memory: Some( + max_memory.saturating_sub(total_bbbuffer_capacity).max(100 * 1024 * 1024), + ), ..grenad_parameters }; (new_grenad_parameters, total_bbbuffer_capacity) From 52843123d49d5b8a7903a9c6f95ae584f7e87a8c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 5 Dec 2024 10:03:05 +0100 Subject: [PATCH 093/158] Clean up and remove the non-sorted merge_caches function --- crates/milli/src/update/new/extract/cache.rs | 103 +------------------ crates/milli/src/update/new/extract/mod.rs | 3 +- crates/milli/src/update/new/merger.rs | 8 +- 3 files changed, 8 insertions(+), 106 deletions(-) diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 325a72280..658a3127c 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -466,110 +466,13 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>( Ok(bucket_caches) } -/// Merges the caches that must be all associated to the same bucket. +/// Merges the caches that must be all associated to the same bucket +/// but make sure to sort the different buckets before performing the merges. /// /// # Panics /// /// - If the bucket IDs in these frozen caches are not exactly the same. -pub fn merge_caches(frozen: Vec, mut f: F) -> Result<()> -where - F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, -{ - let mut maps = Vec::new(); - let mut readers = Vec::new(); - let mut current_bucket = None; - for FrozenCache { bucket, cache, ref mut spilled } in frozen { - assert_eq!(*current_bucket.get_or_insert(bucket), bucket); - maps.push(cache); - readers.append(spilled); - } - - // First manage the spilled entries by looking into the HashMaps, - // merge them and mark them as dummy. 
- let mut heap = BinaryHeap::new(); - for (source_index, source) in readers.into_iter().enumerate() { - let mut cursor = source.into_cursor()?; - if cursor.move_on_next()?.is_some() { - heap.push(Entry { cursor, source_index }); - } - } - - loop { - let mut first_entry = match heap.pop() { - Some(entry) => entry, - None => break, - }; - - let (first_key, first_value) = match first_entry.cursor.current() { - Some((key, value)) => (key, value), - None => break, - }; - - let mut output = DelAddRoaringBitmap::from_bytes(first_value)?; - while let Some(mut entry) = heap.peek_mut() { - if let Some((key, _value)) = entry.cursor.current() { - if first_key == key { - let new = DelAddRoaringBitmap::from_bytes(first_value)?; - output = output.merge(new); - // When we are done we the current value of this entry move make - // it move forward and let the heap reorganize itself (on drop) - if entry.cursor.move_on_next()?.is_none() { - PeekMut::pop(entry); - } - } else { - break; - } - } - } - - // Once we merged all of the spilled bitmaps we must also - // fetch the entries from the non-spilled entries (the HashMaps). - for (map_index, map) in maps.iter_mut().enumerate() { - if first_entry.source_index != map_index { - if let Some(new) = map.get_mut(first_key) { - output.union_and_clear_bbbul(new); - } - } - } - - // We send the merged entry outside. - (f)(first_key, output)?; - - // Don't forget to put the first entry back into the heap. - if first_entry.cursor.move_on_next()?.is_some() { - heap.push(first_entry) - } - } - - // Then manage the content on the HashMap entries that weren't taken (mem::take). - while let Some(mut map) = maps.pop() { - for (key, bbbul) in map.iter_mut() { - // Make sure we don't try to work with entries already managed by the spilled - if bbbul.is_empty() { - continue; - } - - let mut output = DelAddRoaringBitmap::empty(); - output.union_and_clear_bbbul(bbbul); - - for rhs in maps.iter_mut() { - if let Some(new) = rhs.get_mut(key) { - output.union_and_clear_bbbul(new); - } - } - - // We send the merged entry outside. - (f)(key, output)?; - } - } - - Ok(()) -} - -/// Merges the caches that must be all associated to the same bucket. 
-/// -/// It merges entries like the `merge_caches` function -pub fn merge_caches_alt(frozen: Vec, mut f: F) -> Result<()> +pub fn merge_caches_sorted(frozen: Vec, mut f: F) -> Result<()> where F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, { diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 3601dd9c6..0bdf31635 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -7,8 +7,7 @@ mod vectors; use bumpalo::Bump; pub use cache::{ - merge_caches, merge_caches_alt, transpose_and_freeze_caches, BalancedCaches, - DelAddRoaringBitmap, + merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, }; pub use documents::*; pub use faceted::*; diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 9f2aae5a8..85f5a70f7 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -9,8 +9,8 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::{ - merge_caches, merge_caches_alt, transpose_and_freeze_caches, BalancedCaches, - DelAddRoaringBitmap, FacetKind, GeoExtractorData, + merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, + FacetKind, GeoExtractorData, }; use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result}; @@ -78,7 +78,7 @@ where if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - merge_caches_alt(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { @@ -107,7 +107,7 @@ pub fn merge_and_send_facet_docids<'extractor>( .map(|frozen| { let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); let rtxn = index.read_txn()?; - merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { From 9020a50df89de88d79528663869562d892d1ad4f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 5 Dec 2024 10:14:46 +0100 Subject: [PATCH 094/158] Change the default max memory usage to 5% of the total memory --- crates/meilisearch/src/option.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 7e87a5a2c..7c59f0607 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -654,8 +654,9 @@ impl Opt { #[derive(Debug, Default, Clone, Parser, Deserialize)] pub struct IndexerOpts { - /// Sets the maximum amount of RAM Meilisearch can use when indexing. By default, Meilisearch - /// uses no more than two thirds of available memory. + /// Specifies the maximum resident memory that Meilisearch can use for indexing. + /// By default, Meilisearch limits the RAM usage to 5% of the total available memory. + /// Note that the underlying store utilizes memory-mapping and makes use of the rest. #[clap(long, env = MEILI_MAX_INDEXING_MEMORY, default_value_t)] #[serde(default)] pub max_indexing_memory: MaxMemory, @@ -714,7 +715,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { } } -/// A type used to detect the max memory available and use 2/3 of it. 
+/// A type used to detect the max resident memory available and use 5% of it. #[derive(Debug, Clone, Copy, Deserialize, Serialize)] pub struct MaxMemory(Option); @@ -728,7 +729,7 @@ impl FromStr for MaxMemory { impl Default for MaxMemory { fn default() -> MaxMemory { - MaxMemory(total_memory_bytes().map(|bytes| bytes * 2 / 3).map(Byte::from_u64)) + MaxMemory(total_memory_bytes().map(|bytes| bytes * 5 / 100).map(Byte::from_u64)) } } From 95975944d70ff23ade0210218a09aed6a05f3dbb Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 5 Dec 2024 14:23:38 +0100 Subject: [PATCH 095/158] fix the dumps missing the empty swap index tasks --- crates/index-scheduler/src/batch.rs | 3 ++- .../lib.rs/swap_indexes/third_empty_swap_processed.snap | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index fc6fb194c..cc730e286 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -496,7 +496,7 @@ impl IndexScheduler { // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; - let task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; + let mut task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; // If the task is not associated with any index, verify that it is an index swap and // create the batch directly. Otherwise, get the index name associated with the task @@ -506,6 +506,7 @@ impl IndexScheduler { index_name } else { assert!(matches!(&task.kind, KindWithContent::IndexSwap { swaps } if swaps.is_empty())); + current_batch.processing(Some(&mut task)); return Ok(Some((Batch::IndexSwap { task }, current_batch))); }; diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap b/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap index 0f126b33a..77b1193a5 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap @@ -89,7 +89,7 @@ succeeded [0,1,2,3,4,5,6,] ---------------------------------------------------------------------- ### Batches Kind: "indexCreation" [0,1,2,3,] -"indexSwap" [4,5,] +"indexSwap" [4,5,6,] ---------------------------------------------------------------------- ### Batches Index Tasks: a [0,4,5,] @@ -104,6 +104,7 @@ d [3,4,] [timestamp] [3,] [timestamp] [4,] [timestamp] [5,] +[timestamp] [6,] ---------------------------------------------------------------------- ### Batches Started At: [timestamp] [0,] From 214b51de879d977e6c167287195ad59185c05a75 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 5 Dec 2024 14:45:54 +0100 Subject: [PATCH 096/158] try to fix the snapshot on demand flaky test --- crates/meilisearch/tests/common/mod.rs | 19 +++++++++++++++++++ crates/meilisearch/tests/snapshot/mod.rs | 8 ++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch/tests/common/mod.rs b/crates/meilisearch/tests/common/mod.rs index 3aae2fe80..44385752e 100644 --- a/crates/meilisearch/tests/common/mod.rs +++ b/crates/meilisearch/tests/common/mod.rs @@ -52,6 +52,25 @@ impl Value { } self } + + /// Return `true` if the `status` field is set to `failed`. + /// Panic if the `status` field doesn't exists. 
+ #[track_caller] + pub fn is_fail(&self) -> bool { + if !self["status"].is_string() { + panic!("Called `is_fail` on {}", serde_json::to_string_pretty(&self.0).unwrap()); + } + self["status"] == serde_json::Value::String(String::from("failed")) + } + + // Panic if the json doesn't contain the `status` field set to "succeeded" + #[track_caller] + pub fn failed(&self) -> &Self { + if !self.is_fail() { + panic!("Called failed on {}", serde_json::to_string_pretty(&self.0).unwrap()); + } + self + } } impl From for Value { diff --git a/crates/meilisearch/tests/snapshot/mod.rs b/crates/meilisearch/tests/snapshot/mod.rs index 976551190..0d569fc7c 100644 --- a/crates/meilisearch/tests/snapshot/mod.rs +++ b/crates/meilisearch/tests/snapshot/mod.rs @@ -129,11 +129,11 @@ async fn perform_on_demand_snapshot() { index.load_test_set().await; - server.index("doggo").create(Some("bone")).await; - index.wait_task(2).await; + let (task, _) = server.index("doggo").create(Some("bone")).await; + index.wait_task(task.uid()).await.succeeded(); - server.index("doggo").create(Some("bone")).await; - index.wait_task(2).await; + let (task, _) = server.index("doggo").create(Some("bone")).await; + index.wait_task(task.uid()).await.failed(); let (task, code) = server.create_snapshot().await; snapshot!(code, @"202 Accepted"); From a0a3b55700aaf6bd633d57b494d53ce66df0bf72 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 5 Dec 2024 14:48:29 +0100 Subject: [PATCH 097/158] Change error code --- crates/meilisearch-types/src/error.rs | 2 +- crates/meilisearch/src/search/mod.rs | 2 +- crates/meilisearch/tests/search/facet_search.rs | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index c68059682..afc876b42 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -279,7 +279,7 @@ InvalidSearchPage , InvalidRequest , BAD_REQUEST ; InvalidSearchQ , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchQuery , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ; -InvalidFacetSearchDisabled , InvalidRequest , BAD_REQUEST ; +FacetSearchDisabled , InvalidRequest , BAD_REQUEST ; InvalidSearchVector , InvalidRequest , BAD_REQUEST ; InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 9e0c936b7..7beaad6a5 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -1410,7 +1410,7 @@ pub fn perform_facet_search( if !index.facet_search(&rtxn)? 
{ return Err(ResponseError::from_msg( "The facet search is disabled for this index".to_string(), - Code::InvalidFacetSearchDisabled, + Code::FacetSearchDisabled, )); } diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 23f312490..19224c3df 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -225,9 +225,9 @@ async fn add_documents_and_deactivate_facet_search() { snapshot!(response, @r###" { "message": "The facet search is disabled for this index", - "code": "invalid_facet_search_disabled", + "code": "facet_search_disabled", "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_facet_search_disabled" + "link": "https://docs.meilisearch.com/errors#facet_search_disabled" } "###); } @@ -256,9 +256,9 @@ async fn deactivate_facet_search_and_add_documents() { snapshot!(response, @r###" { "message": "The facet search is disabled for this index", - "code": "invalid_facet_search_disabled", + "code": "facet_search_disabled", "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_facet_search_disabled" + "link": "https://docs.meilisearch.com/errors#facet_search_disabled" } "###); } From c77073efcca508df72eede587401fd94235cfd4c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 15:50:12 +0100 Subject: [PATCH 098/158] Update::has_changed_for_fields --- .../milli/src/update/new/document_change.rs | 79 ++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 899655db1..1644b2254 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,7 +1,10 @@ use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::document::{ + Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, +}; +use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; @@ -164,6 +167,80 @@ impl<'doc> Update<'doc> { } } + /// Returns whether the updated version of the document is different from the current version for the passed subset of fields. + /// + /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed. + /// Otherwise `false`. 
+ pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( + &self, + fields: Option<&[&str]>, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let mut changed = false; + let mut cached_current = None; + let mut updated_selected_field_count = 0; + + for entry in self.updated().iter_top_level_fields() { + let (key, updated_value) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + + updated_selected_field_count += 1; + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + let current_value = current.top_level_field(key)?; + let Some(current_value) = current_value else { + changed = true; + break; + }; + + if current_value.get() != updated_value.get() { + changed = true; + break; + } + cached_current = Some(current); + } + + if !self.has_deletion { + // no field deletion, so fields that don't appear in `updated` cannot have changed + return Ok(changed); + } + + if changed { + return Ok(true); + } + + // we saw all updated fields, and set `changed` if any field wasn't in `current`. + // so if there are as many fields in `current` as in `updated`, then nothing changed. + // If there is any more fields in `current`, then they are missing in `updated`. + let has_deleted_fields = { + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + + let mut current_selected_field_count = 0; + for entry in current.iter_top_level_fields() { + let (key, _) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + current_selected_field_count += 1; + } + + current_selected_field_count != updated_selected_field_count + }; + + Ok(has_deleted_fields) + } + pub fn updated_vectors( &self, doc_alloc: &'doc Bump, From c77b00d3ac9c9ed893ae7be940a57eebd3efd338 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 15:51:58 +0100 Subject: [PATCH 099/158] Don't extract word docids when no searchable changed --- .../new/extract/searchable/extract_word_docids.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 05e2374dc..39f67e417 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -8,8 +8,9 @@ use bumpalo::Bump; use heed::RoTxn; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; +use crate::update::new::document::Document as _; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::extract::perm_json_p::contained_in; +use crate::update::new::extract::perm_json_p::{self, contained_in}; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; @@ -351,6 +352,15 @@ impl WordDocidsExtractors { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + &context.rtxn, + context.index, + context.db_fields_ids_map, + )? 
{ + return Ok(()); + } + let mut token_fn = |fname: &str, fid, pos, word: &str| { cached_sorter.insert_del_u32( fid, From 2b74d1824bca13d92757433391fd0f7ad0fabb43 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 15:56:22 +0100 Subject: [PATCH 100/158] Ignore documents that didn't change any field in word pair proximity --- .../searchable/extract_word_pair_proximity_docids.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dcd9e3a78..e58c0efd2 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, From fa8b9acdf6aa932d9a5421ff4f4347f39b280a5e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 16:12:52 +0100 Subject: [PATCH 101/158] Ignore documents that didn't change in facets --- .../src/update/new/extract/faceted/extract_facets.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index f2132ce38..b865d0a35 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -97,6 +97,15 @@ impl FacetedDocidsExtractor { }, ), DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + Some(attributes_to_extract), + rtxn, + index, + context.db_fields_ids_map, + )? 
{ + return Ok(()); + } + extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, From bd5110a2fed4c36f8d57c8eba2de94367e750c96 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 16:13:07 +0100 Subject: [PATCH 102/158] Fix clippy warnings --- .../src/update/new/extract/searchable/extract_word_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 39f67e417..06fb747c6 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -8,9 +8,8 @@ use bumpalo::Bump; use heed::RoTxn; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use crate::update::new::document::Document as _; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::extract::perm_json_p::{self, contained_in}; +use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; From 95ed07976146fdab8a3be7000ce9cc1adbc9d726 Mon Sep 17 00:00:00 2001 From: airycanon Date: Fri, 22 Nov 2024 14:11:56 +0800 Subject: [PATCH 103/158] attach index name in errors # Conflicts: # crates/index-scheduler/src/batch.rs # Conflicts: # crates/index-scheduler/src/batch.rs # crates/meilisearch/src/search/mod.rs --- crates/index-scheduler/src/error.rs | 15 +++++++++---- .../src/index_mapper/index_map.rs | 4 ++-- .../index-scheduler/src/index_mapper/mod.rs | 13 ++++++------ crates/index-scheduler/src/lib.rs | 21 ++++++++++++------- crates/meilisearch/src/error.rs | 16 +++++++++++--- crates/meilisearch/src/lib.rs | 3 ++- .../src/routes/indexes/facet_search.rs | 2 +- crates/meilisearch/src/routes/indexes/mod.rs | 4 ++-- .../meilisearch/src/routes/indexes/search.rs | 14 +++++++------ .../meilisearch/src/routes/indexes/similar.rs | 2 +- crates/meilisearch/src/routes/multi_search.rs | 6 ++++-- crates/meilisearch/src/search/federated.rs | 7 ++++--- 12 files changed, 68 insertions(+), 39 deletions(-) diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index f6a4ecc04..82388172e 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -122,8 +122,11 @@ pub enum Error { Dump(#[from] dump::Error), #[error(transparent)] Heed(#[from] heed::Error), - #[error(transparent)] - Milli(#[from] milli::Error), + #[error("{}", match .index_name { + Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name), + _ => format!("{error}") + })] + Milli { error: milli::Error, index_name: Option }, #[error("An unexpected crash occurred when processing the task.")] ProcessBatchPanicked, #[error(transparent)] @@ -190,7 +193,7 @@ impl Error { | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) - | Error::Milli(_) + | Error::Milli { .. 
} | Error::ProcessBatchPanicked | Error::FileStore(_) | Error::IoError(_) @@ -209,6 +212,10 @@ impl Error { pub fn with_custom_error_code(self, code: Code) -> Self { Self::WithCustomErrorCode(code, Box::new(self)) } + + pub fn from_milli(error: milli::Error, index_name: Option) -> Self { + Self::Milli { error, index_name } + } } impl ErrorCode for Error { @@ -236,7 +243,7 @@ impl ErrorCode for Error { // TODO: not sure of the Code to use Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice, Error::Dump(e) => e.error_code(), - Error::Milli(e) => e.error_code(), + Error::Milli { error, .. } => error.error_code(), Error::ProcessBatchPanicked => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), diff --git a/crates/index-scheduler/src/index_mapper/index_map.rs b/crates/index-scheduler/src/index_mapper/index_map.rs index f8080d23b..c20782068 100644 --- a/crates/index-scheduler/src/index_mapper/index_map.rs +++ b/crates/index-scheduler/src/index_mapper/index_map.rs @@ -3,13 +3,13 @@ use std::path::Path; use std::time::Duration; use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions}; -use meilisearch_types::milli::Index; +use meilisearch_types::milli::{Index, Result}; use time::OffsetDateTime; use uuid::Uuid; use super::IndexStatus::{self, Available, BeingDeleted, Closing, Missing}; use crate::lru::{InsertionOutcome, LruMap}; -use crate::{clamp_to_page_size, Result}; +use crate::{clamp_to_page_size}; /// Keep an internally consistent view of the open indexes in memory. /// diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 3cccb5a69..500e4cf83 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use tracing::error; use uuid::Uuid; - +use meilisearch_types::milli; use self::index_map::IndexMap; use self::IndexStatus::{Available, BeingDeleted, Closing, Missing}; use crate::uuid_codec::UuidCodec; @@ -121,7 +121,7 @@ impl IndexStats { /// # Parameters /// /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. - pub fn new(index: &Index, rtxn: &RoTxn) -> Result { + pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { Ok(IndexStats { number_of_documents: index.number_of_documents(rtxn)?, database_size: index.on_disk_size()?, @@ -189,7 +189,7 @@ impl IndexMapper { date, self.enable_mdb_writemap, self.index_base_map_size, - )?; + ).map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; wtxn.commit()?; @@ -357,7 +357,8 @@ impl IndexMapper { }; let index_path = self.base_path.join(uuid.to_string()); // take the lock to reopen the environment. 
- reopen.reopen(&mut self.index_map.write().unwrap(), &index_path)?; + reopen.reopen(&mut self.index_map.write().unwrap(), &index_path) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; continue; } BeingDeleted => return Err(Error::IndexNotFound(name.to_string())), @@ -378,7 +379,7 @@ impl IndexMapper { None, self.enable_mdb_writemap, self.index_base_map_size, - )?; + ).map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; } Available(index) => break index, Closing(_) => { @@ -459,7 +460,7 @@ impl IndexMapper { None => { let index = self.index(rtxn, index_uid)?; let index_rtxn = index.read_txn()?; - IndexStats::new(&index, &index_rtxn) + IndexStats::new(&index, &index_rtxn).map_err(|e| Error::from_milli(e, Some(uuid.to_string()))) } } } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 9405ecf24..6147f788f 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1678,9 +1678,9 @@ impl IndexScheduler { tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); } // If we have an abortion error we must stop the tick here and re-schedule tasks. - Err(Error::Milli(milli::Error::InternalError( - milli::InternalError::AbortedIndexation, - ))) + Err(Error::Milli{ + error: milli::Error::InternalError(milli::InternalError::AbortedIndexation), .. + }) | Err(Error::AbortedTask) => { #[cfg(test)] self.breakpoint(Breakpoint::AbortedIndexation); @@ -1699,9 +1699,9 @@ impl IndexScheduler { // 2. close the associated environment // 3. resize it // 4. re-schedule tasks - Err(Error::Milli(milli::Error::UserError( - milli::UserError::MaxDatabaseSizeReached, - ))) if index_uid.is_some() => { + Err(Error::Milli { + error: milli::Error::UserError(milli::UserError::MaxDatabaseSizeReached), .. + }) if index_uid.is_some() => { // fixme: add index_uid to match to avoid the unwrap let index_uid = index_uid.unwrap(); // fixme: handle error more gracefully? not sure when this could happen @@ -1943,6 +1943,7 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, + index_uid: String, embedding_configs: Vec, ) -> Result { let res: Result<_> = embedding_configs @@ -1954,7 +1955,10 @@ impl IndexScheduler { .. }| { let prompt = - Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + Arc::new(prompt.try_into() + .map_err(meilisearch_types::milli::Error::from) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))? 
+ ); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); @@ -1970,7 +1974,8 @@ impl IndexScheduler { let embedder = Arc::new( Embedder::new(embedder_options.clone()) .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, + .map_err(meilisearch_types::milli::Error::from) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?, ); { let mut embedders = self.embedders.write().unwrap(); diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index 5c4ce171f..6e7283a18 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -7,6 +7,7 @@ use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError}; use meilisearch_types::milli::OrderBy; use serde_json::Value; use tokio::task::JoinError; +use meilisearch_types::milli; #[derive(Debug, thiserror::Error)] pub enum MeilisearchHttpError { @@ -62,8 +63,11 @@ pub enum MeilisearchHttpError { HeedError(#[from] meilisearch_types::heed::Error), #[error(transparent)] IndexScheduler(#[from] index_scheduler::Error), - #[error(transparent)] - Milli(#[from] meilisearch_types::milli::Error), + #[error("{}", match .index_name { + Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name), + _ => format!("{error}") + })] + Milli { error: meilisearch_types::milli::Error, index_name: Option }, #[error(transparent)] Payload(#[from] PayloadError), #[error(transparent)] @@ -76,6 +80,12 @@ pub enum MeilisearchHttpError { MissingSearchHybrid, } +impl MeilisearchHttpError { + pub(crate) fn from_milli(error: milli::Error, index_name: Option) -> Self { + Self::Milli { error, index_name } + } +} + impl ErrorCode for MeilisearchHttpError { fn error_code(&self) -> Code { match self { @@ -95,7 +105,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::SerdeJson(_) => Code::Internal, MeilisearchHttpError::HeedError(_) => Code::Internal, MeilisearchHttpError::IndexScheduler(e) => e.error_code(), - MeilisearchHttpError::Milli(e) => e.error_code(), + MeilisearchHttpError::Milli{error, ..} => error.error_code(), MeilisearchHttpError::Payload(e) => e.error_code(), MeilisearchHttpError::FileStore(_) => Code::Internal, MeilisearchHttpError::DocumentFormat(e) => e.error_code(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 633ad2776..779af63f2 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -395,6 +395,7 @@ fn import_dump( for index_reader in dump_reader.indexes()? 
{ let mut index_reader = index_reader?; let metadata = index_reader.metadata(); + let uid = metadata.uid.clone(); tracing::info!("Importing index `{}`.", metadata.uid); let date = Some((metadata.created_at, metadata.updated_at)); @@ -432,7 +433,7 @@ fn import_dump( let reader = DocumentsBatchReader::from_reader(reader)?; let embedder_configs = index.embedding_configs(&wtxn)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + let embedders = index_scheduler.embedders(uid, embedder_configs)?; let builder = milli::update::IndexDocuments::new( &mut wtxn, diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 99a4a4f28..fc29d3406 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -185,7 +185,7 @@ pub async fn search( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&search_query, &index_scheduler, &index, features)?; + let search_kind = search_kind(&search_query, &index_scheduler, index_uid.to_string(), &index, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { perform_facet_search( diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 7d073ec5f..1dda27a98 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -5,7 +5,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::{DeserializeError, Deserr, ValuePointerRef}; -use index_scheduler::IndexScheduler; +use index_scheduler::{Error, IndexScheduler}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{immutable_field_error, DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; @@ -107,7 +107,7 @@ pub async fn list_indexes( if !filters.is_index_authorized(uid) { return Ok(None); } - Ok(Some(IndexView::new(uid.to_string(), index)?)) + Ok(Some(IndexView::new(uid.to_string(), index).map_err(|e| Error::from_milli(e, Some(uid.to_string())))?)) })?; // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. 
let indexes: Vec = indexes.into_iter().flatten().collect(); diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 2f5cb4a36..609439b4a 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -243,11 +243,11 @@ pub async fn search_with_url_query( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vector, index_scheduler.features()) + perform_search(index_uid.to_string(), &index, query, search_kind, retrieve_vector, index_scheduler.features()) }) .await; permit.drop().await; @@ -287,12 +287,12 @@ pub async fn search_with_post( let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vectors, index_scheduler.features()) + perform_search(index_uid.to_string(), &index, query, search_kind, retrieve_vectors, index_scheduler.features()) }) .await; permit.drop().await; @@ -314,6 +314,7 @@ pub async fn search_with_post( pub fn search_kind( query: &SearchQuery, index_scheduler: &IndexScheduler, + index_uid: String, index: &milli::Index, features: RoFeatures, ) -> Result { @@ -332,7 +333,7 @@ pub fn search_kind( (None, _, None) => Ok(SearchKind::KeywordOnly), // hybrid.semantic_ratio == 1.0 => vector (_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { - SearchKind::semantic(index_scheduler, index, embedder, v.map(|v| v.len())) + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) } // hybrid.semantic_ratio == 0.0 => keyword (_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { @@ -340,13 +341,14 @@ pub fn search_kind( } // no query, hybrid, vector => semantic (None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => { - SearchKind::semantic(index_scheduler, index, embedder, Some(v.len())) + SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len())) } // query, no hybrid, no vector => keyword (Some(_), None, None) => Ok(SearchKind::KeywordOnly), // query, hybrid, maybe vector => hybrid (Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( index_scheduler, + index_uid, index, embedder, **semantic_ratio, diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index 79f42f0aa..a0fccff52 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -104,7 +104,7 @@ async fn similar( let index = index_scheduler.index(&index_uid)?; let (embedder_name, embedder, quantized) = - 
SearchKind::embedder(&index_scheduler, &index, &query.embedder, None)?; + SearchKind::embedder(&index_scheduler, index_uid.to_string(), &index, &query.embedder, None)?; tokio::task::spawn_blocking(move || { perform_similar( diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index f8b1bc6ee..c4496e41c 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -125,14 +125,16 @@ pub async fn multi_search_with_post( }) .with_index(query_index)?; + let index_uid_str = index_uid.to_string(); + let search_kind = - search_kind(&query, index_scheduler.get_ref(), &index, features) + search_kind(&query, index_scheduler.get_ref(), index_uid_str.clone(), &index, features) .with_index(query_index)?; let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features) .with_index(query_index)?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vector, features) + perform_search(index_uid_str.clone(), &index, query, search_kind, retrieve_vector, features) }) .await .with_index(query_index)?; diff --git a/crates/meilisearch/src/search/federated.rs b/crates/meilisearch/src/search/federated.rs index 5279c26bb..5aae82c66 100644 --- a/crates/meilisearch/src/search/federated.rs +++ b/crates/meilisearch/src/search/federated.rs @@ -560,7 +560,7 @@ pub fn perform_federated_search( // use an immediately invoked lambda to capture the result without returning from the function let res: Result<(), ResponseError> = (|| { - let search_kind = search_kind(&query, index_scheduler, &index, features)?; + let search_kind = search_kind(&query, index_scheduler, index_uid.to_string(), &index, features)?; let canonicalization_kind = match (&search_kind, &query.q) { (SearchKind::SemanticOnly { .. 
}, _) => { @@ -636,7 +636,7 @@ pub fn perform_federated_search( search.offset(0); search.limit(required_hit_count); - let (result, _semantic_hit_count) = super::search_from_kind(search_kind, search)?; + let (result, _semantic_hit_count) = super::search_from_kind(index_uid.to_string(), search_kind, search)?; let format = AttributesFormat { attributes_to_retrieve: query.attributes_to_retrieve, retrieve_vectors, @@ -670,7 +670,8 @@ pub fn perform_federated_search( let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - let hit_maker = HitMaker::new(&index, &rtxn, format, formatter_builder)?; + let hit_maker = HitMaker::new(&index, &rtxn, format, formatter_builder) + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; results_by_query.push(SearchResultByQuery { federation_options, From b75f1f4c17c3cd8608227ea11219018feca69960 Mon Sep 17 00:00:00 2001 From: airycanon Date: Fri, 22 Nov 2024 14:19:20 +0800 Subject: [PATCH 104/158] fix tests # Conflicts: # crates/index-scheduler/src/batch.rs # crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap # crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap # Conflicts: # crates/index-scheduler/src/batch.rs # crates/meilisearch/src/search/mod.rs # crates/meilisearch/tests/vector/mod.rs # Conflicts: # crates/index-scheduler/src/batch.rs --- crates/index-scheduler/src/batch.rs | 210 ++++++++++-------- crates/index-scheduler/src/error.rs | 23 +- .../src/index_mapper/index_map.rs | 3 +- .../index-scheduler/src/index_mapper/mod.rs | 51 +++-- crates/index-scheduler/src/lib.rs | 24 +- .../after_removing_the_documents.snap | 4 +- crates/meilisearch/src/error.rs | 6 +- .../src/routes/indexes/facet_search.rs | 3 +- crates/meilisearch/src/routes/indexes/mod.rs | 5 +- .../meilisearch/src/routes/indexes/search.rs | 24 +- .../meilisearch/src/routes/indexes/similar.rs | 9 +- crates/meilisearch/src/routes/multi_search.rs | 20 +- crates/meilisearch/src/search/federated.rs | 12 +- crates/meilisearch/src/search/mod.rs | 55 +++-- .../tests/documents/add_documents.rs | 32 +-- crates/meilisearch/tests/documents/errors.rs | 4 +- .../meilisearch/tests/index/update_index.rs | 2 +- crates/meilisearch/tests/search/errors.rs | 104 ++++----- crates/meilisearch/tests/search/multi.rs | 12 +- crates/meilisearch/tests/tasks/mod.rs | 2 +- .../tests/vector/binary_quantized.rs | 2 +- crates/meilisearch/tests/vector/mod.rs | 24 +- crates/meilisearch/tests/vector/openai.rs | 4 +- 
crates/meilisearch/tests/vector/rest.rs | 24 +- 24 files changed, 378 insertions(+), 281 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index cc730e286..9a3ba4929 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -29,7 +29,6 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; use dump::IndexMetadata; use meilisearch_types::batches::BatchId; -use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; @@ -689,7 +688,9 @@ impl IndexScheduler { let index = self.index_mapper.index(&rtxn, name)?; let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string()); fs::create_dir_all(&dst)?; - index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; + index + .copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled) + .map_err(|e| Error::from_milli(e, Some(name.to_string())))?; } drop(rtxn); @@ -791,16 +792,19 @@ impl IndexScheduler { let content_file = self.file_store.get_update(content_file)?; let reader = DocumentsBatchReader::from_reader(content_file) - .map_err(milli::Error::from)?; + .map_err(|e| Error::from_milli(e.into(), None))?; let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); - while let Some(doc) = - cursor.next_document().map_err(milli::Error::from)? + while let Some(doc) = cursor + .next_document() + .map_err(|e| Error::from_milli(e.into(), None))? { - dump_content_file - .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; + dump_content_file.push_document( + &obkv_to_object(doc, &documents_batch_index) + .map_err(|e| Error::from_milli(e, None))?, + )?; } dump_content_file.flush()?; } @@ -814,27 +818,41 @@ impl IndexScheduler { let metadata = IndexMetadata { uid: uid.to_owned(), primary_key: index.primary_key(&rtxn)?.map(String::from), - created_at: index.created_at(&rtxn)?, - updated_at: index.updated_at(&rtxn)?, + created_at: index + .created_at(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, + updated_at: index + .updated_at(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, }; let mut index_dumper = dump.create_index(uid, &metadata)?; let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(&rtxn)?; + let embedding_configs = index + .embedding_configs(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let documents = index + .all_documents(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // 3.1. Dump the documents - for ret in index.all_documents(&rtxn)? 
{ + for ret in documents { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } - let (id, doc) = ret?; + let (id, doc) = + ret.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + let mut document = + milli::obkv_to_json(&all_fields, &fields_ids_map, doc) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; 'inject_vectors: { - let embeddings = index.embeddings(&rtxn, id)?; + let embeddings = index + .embeddings(&rtxn, id) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; if embeddings.is_empty() { break 'inject_vectors; @@ -845,7 +863,7 @@ impl IndexScheduler { .or_insert(serde_json::Value::Object(Default::default())); let serde_json::Value::Object(vectors) = vectors else { - return Err(milli::Error::UserError( + let user_err = milli::Error::UserError( milli::UserError::InvalidVectorsMapType { document_id: { if let Ok(Some(Ok(index))) = index @@ -859,8 +877,9 @@ impl IndexScheduler { }, value: vectors.clone(), }, - ) - .into()); + ); + + return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; for (embedder_name, embeddings) in embeddings { @@ -890,7 +909,8 @@ impl IndexScheduler { index, &rtxn, meilisearch_types::settings::SecretPolicy::RevealSecrets, - )?; + ) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; index_dumper.settings(&settings)?; Ok(()) })?; @@ -946,7 +966,8 @@ impl IndexScheduler { // the entire batch. let res = || -> Result<()> { let index_rtxn = index.read_txn()?; - let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; let mut wtxn = self.env.write_txn()?; self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; wtxn.commit()?; @@ -988,10 +1009,12 @@ impl IndexScheduler { ); builder.set_primary_key(primary_key); let must_stop_processing = self.must_stop_processing.clone(); - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; + builder + .execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + || must_stop_processing.get(), + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; index_wtxn.commit()?; } @@ -1008,7 +1031,8 @@ impl IndexScheduler { let res = || -> Result<()> { let mut wtxn = self.env.write_txn()?; let index_rtxn = index.read_txn()?; - let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; wtxn.commit()?; Ok(()) @@ -1031,7 +1055,9 @@ impl IndexScheduler { let number_of_documents = || -> Result { let index = self.index_mapper.index(&wtxn, &index_uid)?; let index_rtxn = index.read_txn()?; - Ok(index.number_of_documents(&index_rtxn)?) + index + .number_of_documents(&index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string()))) }() .unwrap_or_default(); @@ -1188,8 +1214,10 @@ impl IndexScheduler { }; match operation { - IndexOperation::DocumentClear { mut tasks, .. 
} => { - let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; + IndexOperation::DocumentClear { index_uid, mut tasks } => { + let count = milli::update::ClearDocuments::new(index_wtxn, index) + .execute() + .map_err(|e| Error::from_milli(e, Some(index_uid)))?; let mut first_clear_found = false; for task in &mut tasks { @@ -1209,7 +1237,7 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::DocumentOperation { - index_uid: _, + index_uid, primary_key, method, operations, @@ -1235,13 +1263,17 @@ impl IndexScheduler { let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; for operation in operations { match operation { DocumentOperation::Add(_content_uuid) => { let mmap = content_files_iter.next().unwrap(); - indexer.add_documents(mmap)?; + indexer + .add_documents(mmap) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; } DocumentOperation::Delete(document_ids) => { let document_ids: bumpalo::collections::vec::Vec<_> = document_ids @@ -1266,15 +1298,17 @@ impl IndexScheduler { } }; - let (document_changes, operation_stats, primary_key) = indexer.into_changes( - &indexer_alloc, - index, - &rtxn, - primary_key.as_deref(), - &mut new_fields_ids_map, - &|| must_stop_processing.get(), - &send_progress, - )?; + let (document_changes, operation_stats, primary_key) = indexer + .into_changes( + &indexer_alloc, + index, + &rtxn, + primary_key.as_deref(), + &mut new_fields_ids_map, + &|| must_stop_processing.get(), + &send_progress, + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; let mut addition = 0; for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { @@ -1321,14 +1355,15 @@ impl IndexScheduler { embedders, &|| must_stop_processing.get(), &send_progress, - )?; + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } Ok(tasks) } - IndexOperation::DocumentEdition { mut task, .. } => { + IndexOperation::DocumentEdition { index_uid, mut task } => { let (filter, code) = if let KindWithContent::DocumentEdition { filter_expr, context: _, @@ -1342,16 +1377,11 @@ impl IndexScheduler { }; let candidates = match filter.as_ref().map(Filter::from_json) { - Some(Ok(Some(filter))) => { - filter.evaluate(index_wtxn, index).map_err(|err| match err { - milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { - Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter) - } - e => e.into(), - })? 
- } + Some(Ok(Some(filter))) => filter + .evaluate(index_wtxn, index) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, None | Some(Ok(None)) => index.documents_ids(index_wtxn)?, - Some(Err(e)) => return Err(e.into()), + Some(Err(e)) => return Err(Error::from_milli(e, Some(index_uid.clone()))), }; let (original_filter, context, function) = if let Some(Details::DocumentEdition { @@ -1386,8 +1416,9 @@ impl IndexScheduler { // candidates not empty => index not empty => a primary key is set let primary_key = index.primary_key(&rtxn)?.unwrap(); - let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) - .map_err(milli::Error::from)?; + let primary_key = + PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; @@ -1406,11 +1437,17 @@ impl IndexScheduler { }; let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); - let document_changes = - pool.install(|| indexer.into_changes(&primary_key)).unwrap()?; - - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let document_changes = pool + .install(|| { + indexer + .into_changes(&primary_key) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone()))) + }) + .unwrap()?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; indexer::index( index_wtxn, @@ -1424,7 +1461,8 @@ impl IndexScheduler { embedders, &|| must_stop_processing.get(), &send_progress, - )?; + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1455,7 +1493,7 @@ impl IndexScheduler { Ok(vec![task]) } - IndexOperation::DocumentDeletion { mut tasks, index_uid: _ } => { + IndexOperation::DocumentDeletion { mut tasks, index_uid } => { let mut to_delete = RoaringBitmap::new(); let external_documents_ids = index.external_documents_ids(); @@ -1476,35 +1514,23 @@ impl IndexScheduler { deleted_documents: Some(will_be_removed), }); } - KindWithContent::DocumentDeletionByFilter { index_uid: _, filter_expr } => { + KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } => { let before = to_delete.len(); let filter = match Filter::from_json(filter_expr) { Ok(filter) => filter, Err(err) => { // theorically, this should be catched by deserr before reaching the index-scheduler and cannot happens task.status = Status::Failed; - task.error = match err { - milli::Error::UserError( - milli::UserError::InvalidFilterExpression { .. 
}, - ) => Some( - Error::from(err) - .with_custom_error_code(Code::InvalidDocumentFilter) - .into(), - ), - e => Some(e.into()), - }; + task.error = Some( + Error::from_milli(err, Some(index_uid.clone())).into(), + ); None } }; if let Some(filter) = filter { - let candidates = - filter.evaluate(index_wtxn, index).map_err(|err| match err { - milli::Error::UserError( - milli::UserError::InvalidFilter(_), - ) => Error::from(err) - .with_custom_error_code(Code::InvalidDocumentFilter), - e => e.into(), - }); + let candidates = filter + .evaluate(index_wtxn, index) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone()))); match candidates { Ok(candidates) => to_delete |= candidates, Err(err) => { @@ -1540,8 +1566,9 @@ impl IndexScheduler { // to_delete not empty => index not empty => primary key set let primary_key = index.primary_key(&rtxn)?.unwrap(); - let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) - .map_err(milli::Error::from)?; + let primary_key = + PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; if !tasks.iter().all(|res| res.error.is_some()) { let local_pool; @@ -1560,8 +1587,10 @@ impl IndexScheduler { let mut indexer = indexer::DocumentDeletion::new(); indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; indexer::index( index_wtxn, @@ -1575,14 +1604,15 @@ impl IndexScheduler { embedders, &|| must_stop_processing.get(), &send_progress, - )?; + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } Ok(tasks) } - IndexOperation::Settings { index_uid: _, settings, mut tasks } => { + IndexOperation::Settings { index_uid, settings, mut tasks } => { let indexer_config = self.index_mapper.indexer_config(); let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config); @@ -1596,10 +1626,12 @@ impl IndexScheduler { task.status = Status::Succeeded; } - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; + builder + .execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + || must_stop_processing.get(), + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; Ok(tasks) } diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index 82388172e..5fb04828c 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -1,13 +1,12 @@ use std::fmt::Display; +use crate::TaskId; use meilisearch_types::batches::BatchId; use meilisearch_types::error::{Code, ErrorCode}; use meilisearch_types::tasks::{Kind, Status}; use meilisearch_types::{heed, milli}; use thiserror::Error; -use crate::TaskId; - #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DateField { BeforeEnqueuedAt, @@ -122,11 +121,11 @@ pub enum Error { Dump(#[from] dump::Error), #[error(transparent)] Heed(#[from] heed::Error), - #[error("{}", match .index_name { - Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name), + #[error("{}", 
match .index_uid { + Some(uid) if !uid.is_empty() => format!("Index `{}`: {error}", uid), _ => format!("{error}") })] - Milli { error: milli::Error, index_name: Option }, + Milli { error: milli::Error, index_uid: Option }, #[error("An unexpected crash occurred when processing the task.")] ProcessBatchPanicked, #[error(transparent)] @@ -213,8 +212,18 @@ impl Error { Self::WithCustomErrorCode(code, Box::new(self)) } - pub fn from_milli(error: milli::Error, index_name: Option) -> Self { - Self::Milli { error, index_name } + pub fn from_milli(err: milli::Error, index_uid: Option) -> Self { + match err { + milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { + Self::Milli { error: err, index_uid } + .with_custom_error_code(Code::InvalidDocumentFilter) + } + milli::Error::UserError(milli::UserError::InvalidFilterExpression { .. }) => { + Self::Milli { error: err, index_uid } + .with_custom_error_code(Code::InvalidDocumentFilter) + } + _ => Self::Milli { error: err, index_uid }, + } } } diff --git a/crates/index-scheduler/src/index_mapper/index_map.rs b/crates/index-scheduler/src/index_mapper/index_map.rs index c20782068..480dafa7c 100644 --- a/crates/index-scheduler/src/index_mapper/index_map.rs +++ b/crates/index-scheduler/src/index_mapper/index_map.rs @@ -8,9 +8,8 @@ use time::OffsetDateTime; use uuid::Uuid; use super::IndexStatus::{self, Available, BeingDeleted, Closing, Missing}; +use crate::clamp_to_page_size; use crate::lru::{InsertionOutcome, LruMap}; -use crate::{clamp_to_page_size}; - /// Keep an internally consistent view of the open indexes in memory. /// /// This view is made of an LRU cache that will evict the least frequently used indexes when new indexes are opened. diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 500e4cf83..8b9ef3597 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -3,19 +3,19 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use std::{fs, thread}; +use self::index_map::IndexMap; +use self::IndexStatus::{Available, BeingDeleted, Closing, Missing}; +use crate::uuid_codec::UuidCodec; +use crate::{Error, Result}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; +use meilisearch_types::milli; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use tracing::error; use uuid::Uuid; -use meilisearch_types::milli; -use self::index_map::IndexMap; -use self::IndexStatus::{Available, BeingDeleted, Closing, Missing}; -use crate::uuid_codec::UuidCodec; -use crate::{Error, Result}; mod index_map; @@ -183,13 +183,18 @@ impl IndexMapper { // Error if the UUIDv4 somehow already exists in the map, since it should be fresh. // This is very unlikely to happen in practice. // TODO: it would be better to lazily create the index. But we need an Index::open function for milli. 
- let index = self.index_map.write().unwrap().create( - &uuid, - &index_path, - date, - self.enable_mdb_writemap, - self.index_base_map_size, - ).map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; + let index = self + .index_map + .write() + .unwrap() + .create( + &uuid, + &index_path, + date, + self.enable_mdb_writemap, + self.index_base_map_size, + ) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; wtxn.commit()?; @@ -357,7 +362,8 @@ impl IndexMapper { }; let index_path = self.base_path.join(uuid.to_string()); // take the lock to reopen the environment. - reopen.reopen(&mut self.index_map.write().unwrap(), &index_path) + reopen + .reopen(&mut self.index_map.write().unwrap(), &index_path) .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; continue; } @@ -373,13 +379,15 @@ impl IndexMapper { Missing => { let index_path = self.base_path.join(uuid.to_string()); - break index_map.create( - &uuid, - &index_path, - None, - self.enable_mdb_writemap, - self.index_base_map_size, - ).map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; + break index_map + .create( + &uuid, + &index_path, + None, + self.enable_mdb_writemap, + self.index_base_map_size, + ) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; } Available(index) => break index, Closing(_) => { @@ -460,7 +468,8 @@ impl IndexMapper { None => { let index = self.index(rtxn, index_uid)?; let index_rtxn = index.read_txn()?; - IndexStats::new(&index, &index_rtxn).map_err(|e| Error::from_milli(e, Some(uuid.to_string()))) + IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string()))) } } } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 6147f788f..e780b21a1 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1678,8 +1678,9 @@ impl IndexScheduler { tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); } // If we have an abortion error we must stop the tick here and re-schedule tasks. - Err(Error::Milli{ - error: milli::Error::InternalError(milli::InternalError::AbortedIndexation), .. + Err(Error::Milli { + error: milli::Error::InternalError(milli::InternalError::AbortedIndexation), + .. }) | Err(Error::AbortedTask) => { #[cfg(test)] @@ -1700,7 +1701,8 @@ impl IndexScheduler { // 3. resize it // 4. re-schedule tasks Err(Error::Milli { - error: milli::Error::UserError(milli::UserError::MaxDatabaseSizeReached), .. + error: milli::Error::UserError(milli::UserError::MaxDatabaseSizeReached), + .. }) if index_uid.is_some() => { // fixme: add index_uid to match to avoid the unwrap let index_uid = index_uid.unwrap(); @@ -1954,11 +1956,12 @@ impl IndexScheduler { config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized }, .. }| { - let prompt = - Arc::new(prompt.try_into() + let prompt = Arc::new( + prompt + .try_into() .map_err(meilisearch_types::milli::Error::from) - .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))? 
- ); + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + ); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); @@ -1974,8 +1977,9 @@ impl IndexScheduler { let embedder = Arc::new( Embedder::new(embedder_options.clone()) .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from) - .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?, + .map_err(|err| { + Error::from_milli(err.into(), Some(index_uid.clone())) + })?, ); { let mut embedders = self.embedders.write().unwrap(); @@ -6176,7 +6180,7 @@ mod tests { insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); - let configs = index_scheduler.embedders(configs).unwrap(); + let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap(); let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); let beagle_embed = hf_embedder.embed_one(S("Intel the beagle best doggo"), None).unwrap(); diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap index 492eae3dd..0ee4d91e5 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap @@ -9,8 +9,8 @@ source: crates/index-scheduler/src/lib.rs 0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} -3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Invalid type for 
filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} -4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} +3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} +4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} 5 {uid: 5, batch_uid: 2, status: succeeded, details: { original_filter: "catto EXISTS", deleted_documents: Some(1) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("catto EXISTS") }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index 6e7283a18..41d62507a 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -4,10 +4,10 @@ use byte_unit::{Byte, UnitType}; use meilisearch_types::document_formats::{DocumentFormatError, PayloadType}; use meilisearch_types::error::{Code, ErrorCode, ResponseError}; use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError}; +use meilisearch_types::milli; use meilisearch_types::milli::OrderBy; use serde_json::Value; use tokio::task::JoinError; -use meilisearch_types::milli; #[derive(Debug, thiserror::Error)] pub enum MeilisearchHttpError { @@ -67,7 +67,7 @@ pub enum MeilisearchHttpError { Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name), _ => format!("{error}") })] - Milli { error: meilisearch_types::milli::Error, index_name: Option }, + Milli { error: milli::Error, index_name: Option }, #[error(transparent)] Payload(#[from] PayloadError), #[error(transparent)] @@ -105,7 +105,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::SerdeJson(_) => Code::Internal, MeilisearchHttpError::HeedError(_) => Code::Internal, MeilisearchHttpError::IndexScheduler(e) => e.error_code(), - MeilisearchHttpError::Milli{error, ..} => error.error_code(), + MeilisearchHttpError::Milli { error, .. 
} => error.error_code(), MeilisearchHttpError::Payload(e) => e.error_code(), MeilisearchHttpError::FileStore(_) => Code::Internal, MeilisearchHttpError::DocumentFormat(e) => e.error_code(), diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index fc29d3406..ff11f1305 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -185,7 +185,8 @@ pub async fn search( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&search_query, &index_scheduler, index_uid.to_string(), &index, features)?; + let search_kind = + search_kind(&search_query, &index_scheduler, index_uid.to_string(), &index, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { perform_facet_search( diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 1dda27a98..b2a85335b 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -107,7 +107,10 @@ pub async fn list_indexes( if !filters.is_index_authorized(uid) { return Ok(None); } - Ok(Some(IndexView::new(uid.to_string(), index).map_err(|e| Error::from_milli(e, Some(uid.to_string())))?)) + Ok(Some( + IndexView::new(uid.to_string(), index) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, + )) })?; // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. let indexes: Vec = indexes.into_iter().flatten().collect(); diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 609439b4a..fbaac67da 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -243,11 +243,19 @@ pub async fn search_with_url_query( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(index_uid.to_string(), &index, query, search_kind, retrieve_vector, index_scheduler.features()) + perform_search( + index_uid.to_string(), + &index, + query, + search_kind, + retrieve_vector, + index_scheduler.features(), + ) }) .await; permit.drop().await; @@ -287,12 +295,20 @@ pub async fn search_with_post( let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(index_uid.to_string(), &index, query, search_kind, retrieve_vectors, index_scheduler.features()) + perform_search( + index_uid.to_string(), + &index, + query, + search_kind, + retrieve_vectors, + index_scheduler.features(), + ) }) .await; 
permit.drop().await; diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index a0fccff52..f47771061 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -103,8 +103,13 @@ async fn similar( let index = index_scheduler.index(&index_uid)?; - let (embedder_name, embedder, quantized) = - SearchKind::embedder(&index_scheduler, index_uid.to_string(), &index, &query.embedder, None)?; + let (embedder_name, embedder, quantized) = SearchKind::embedder( + &index_scheduler, + index_uid.to_string(), + &index, + &query.embedder, + None, + )?; tokio::task::spawn_blocking(move || { perform_similar( diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index c4496e41c..a2db0b22b 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -127,14 +127,26 @@ pub async fn multi_search_with_post( let index_uid_str = index_uid.to_string(); - let search_kind = - search_kind(&query, index_scheduler.get_ref(), index_uid_str.clone(), &index, features) - .with_index(query_index)?; + let search_kind = search_kind( + &query, + index_scheduler.get_ref(), + index_uid_str.clone(), + &index, + features, + ) + .with_index(query_index)?; let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features) .with_index(query_index)?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(index_uid_str.clone(), &index, query, search_kind, retrieve_vector, features) + perform_search( + index_uid_str.clone(), + &index, + query, + search_kind, + retrieve_vector, + features, + ) }) .await .with_index(query_index)?; diff --git a/crates/meilisearch/src/search/federated.rs b/crates/meilisearch/src/search/federated.rs index 5aae82c66..c1c6bb7d7 100644 --- a/crates/meilisearch/src/search/federated.rs +++ b/crates/meilisearch/src/search/federated.rs @@ -560,7 +560,8 @@ pub fn perform_federated_search( // use an immediately invoked lambda to capture the result without returning from the function let res: Result<(), ResponseError> = (|| { - let search_kind = search_kind(&query, index_scheduler, index_uid.to_string(), &index, features)?; + let search_kind = + search_kind(&query, index_scheduler, index_uid.to_string(), &index, features)?; let canonicalization_kind = match (&search_kind, &query.q) { (SearchKind::SemanticOnly { .. 
}, _) => { @@ -636,7 +637,8 @@ pub fn perform_federated_search( search.offset(0); search.limit(required_hit_count); - let (result, _semantic_hit_count) = super::search_from_kind(index_uid.to_string(), search_kind, search)?; + let (result, _semantic_hit_count) = + super::search_from_kind(index_uid.to_string(), search_kind, search)?; let format = AttributesFormat { attributes_to_retrieve: query.attributes_to_retrieve, retrieve_vectors, @@ -670,8 +672,10 @@ pub fn perform_federated_search( let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - let hit_maker = HitMaker::new(&index, &rtxn, format, formatter_builder) - .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; + let hit_maker = + HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| { + MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) + })?; results_by_query.push(SearchResultByQuery { federation_options, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 7beaad6a5..674ae226b 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -19,7 +19,9 @@ use meilisearch_types::locales::Locale; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; -use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; +use meilisearch_types::milli::{ + FacetValueHit, InternalError, OrderBy, SearchForFacetValues, TimeBudget, +}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; use milli::tokenizer::{Language, TokenizerBuilder}; @@ -281,35 +283,38 @@ pub enum SearchKind { impl SearchKind { pub(crate) fn semantic( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, vector_len: Option, ) -> Result { let (embedder_name, embedder, quantized) = - Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + Self::embedder(index_scheduler, index_uid, index, embedder_name, vector_len)?; Ok(Self::SemanticOnly { embedder_name, embedder, quantized }) } pub(crate) fn hybrid( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, semantic_ratio: f32, vector_len: Option, ) -> Result { let (embedder_name, embedder, quantized) = - Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + Self::embedder(index_scheduler, index_uid, index, embedder_name, vector_len)?; Ok(Self::Hybrid { embedder_name, embedder, quantized, semantic_ratio }) } pub(crate) fn embedder( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, vector_len: Option, ) -> Result<(String, Arc, bool), ResponseError> { let embedder_configs = index.embedding_configs(&index.read_txn()?)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + let embedders = index_scheduler.embedders(index_uid, embedder_configs)?; let (embedder, _, quantized) = embedders .get(embedder_name) @@ -890,6 +895,7 @@ fn prepare_search<'t>( } pub fn perform_search( + index_uid: String, index: &Index, query: SearchQuery, search_kind: SearchKind, @@ -916,7 +922,7 @@ pub fn perform_search( used_negative_operator, }, semantic_hit_count, - ) = search_from_kind(search_kind, search)?; + ) = search_from_kind(index_uid, search_kind, search)?; let SearchQuery 
{ q, @@ -1069,17 +1075,27 @@ fn compute_facet_distribution_stats>( } pub fn search_from_kind( + index_uid: String, search_kind: SearchKind, search: milli::Search<'_>, ) -> Result<(milli::SearchResult, Option), MeilisearchHttpError> { let (milli_result, semantic_hit_count) = match &search_kind { - SearchKind::KeywordOnly => (search.execute()?, None), + SearchKind::KeywordOnly => { + let results = search + .execute() + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; + (results, None) + } SearchKind::SemanticOnly { .. } => { - let results = search.execute()?; + let results = search + .execute() + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; let semantic_hit_count = results.document_scores.len() as u32; (results, Some(semantic_hit_count)) } - SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?, + SearchKind::Hybrid { semantic_ratio, .. } => search + .execute_hybrid(*semantic_ratio) + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid)))?, }; Ok((milli_result, semantic_hit_count)) } @@ -1181,7 +1197,7 @@ impl<'a> HitMaker<'a> { rtxn: &'a RoTxn<'a>, format: AttributesFormat, mut formatter_builder: MatcherBuilder<'a>, - ) -> Result { + ) -> milli::Result { formatter_builder.crop_marker(format.crop_marker); formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); @@ -1276,11 +1292,7 @@ impl<'a> HitMaker<'a> { }) } - pub fn make_hit( - &self, - id: u32, - score: &[ScoreDetails], - ) -> Result { + pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result { let (_, obkv) = self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; @@ -1323,7 +1335,10 @@ impl<'a> HitMaker<'a> { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; - vectors.insert(name, serde_json::to_value(embeddings)?); + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, + ); } document.insert("_vectors".into(), vectors.into()); } @@ -1369,7 +1384,7 @@ fn make_hits<'a>( format: AttributesFormat, matching_words: milli::MatchingWords, documents_ids_scores: impl Iterator)> + 'a, -) -> Result, MeilisearchHttpError> { +) -> milli::Result> { let mut documents = Vec::new(); let dictionary = index.dictionary(rtxn)?; @@ -1697,12 +1712,12 @@ fn make_document( displayed_attributes: &BTreeSet, field_ids_map: &FieldsIdsMap, obkv: &obkv::KvReaderU16, -) -> Result { +) -> milli::Result { let mut document = serde_json::Map::new(); // recreate the original json for (key, value) in obkv.iter() { - let value = serde_json::from_slice(value)?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; let key = field_ids_map.name(key).expect("Missing field name").to_string(); document.insert(key, value); @@ -1727,7 +1742,7 @@ fn format_fields( displayable_ids: &BTreeSet, locales: Option<&[Language]>, localized_attributes: &[LocalizedAttributesRule], -) -> Result<(Option, Document), MeilisearchHttpError> { +) -> milli::Result<(Option, Document)> { let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); @@ -1905,7 +1920,7 @@ fn parse_filter_array(arr: &[Value]) -> Result, MeilisearchHttpEr } } - Ok(Filter::from_array(ands)?) 
+ Filter::from_array(ands).map_err(|e| MeilisearchHttpError::from_milli(e, None)) } #[cfg(test)] diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index eebc5dc63..750bf7ae9 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1681,7 +1681,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", + "message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1719,7 +1719,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1757,7 +1757,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1795,7 +1795,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1833,7 +1833,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1871,7 +1871,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. 
Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1909,7 +1909,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1947,7 +1947,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1985,7 +1985,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2023,7 +2023,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2061,7 +2061,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2099,7 +2099,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", + "message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2138,7 +2138,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse longitude in the document with the id: `\"12\"`. 
Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2175,7 +2175,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2212,7 +2212,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2279,7 +2279,7 @@ async fn add_invalid_geo_and_then_settings() { ] }, "error": { - "message": "Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" diff --git a/crates/meilisearch/tests/documents/errors.rs b/crates/meilisearch/tests/documents/errors.rs index c90b9ed49..1e361fefb 100644 --- a/crates/meilisearch/tests/documents/errors.rs +++ b/crates/meilisearch/tests/documents/errors.rs @@ -604,7 +604,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"doggo = bernese\"" }, "error": { - "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", + "message": "Index `EMPTY_INDEX`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -636,7 +636,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"catto = jorts\"" }, "error": { - "message": "Attribute `catto` is not filterable. Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts", + "message": "Index `SHARED_DOCUMENTS`: Attribute `catto` is not filterable. 
Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" diff --git a/crates/meilisearch/tests/index/update_index.rs b/crates/meilisearch/tests/index/update_index.rs index 36ec27306..f991c3580 100644 --- a/crates/meilisearch/tests/index/update_index.rs +++ b/crates/meilisearch/tests/index/update_index.rs @@ -95,7 +95,7 @@ async fn error_update_existing_primary_key() { let response = index.wait_task(2).await; let expected_response = json!({ - "message": "Index already has a primary key: `id`.", + "message": "Index `test`: Index already has a primary key: `id`.", "code": "index_primary_key_already_exists", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#index_primary_key_already_exists" diff --git a/crates/meilisearch/tests/search/errors.rs b/crates/meilisearch/tests/search/errors.rs index 6840f8fba..ab50e2aa1 100644 --- a/crates/meilisearch/tests/search/errors.rs +++ b/crates/meilisearch/tests/search/errors.rs @@ -711,7 +711,7 @@ async fn filter_invalid_attribute_array() { index.wait_task(task.uid()).await; let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", + "message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid), "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -733,7 +733,7 @@ async fn filter_invalid_attribute_string() { index.wait_task(task.uid()).await; let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", + "message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid), "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -940,7 +940,7 @@ async fn sort_unsortable_attribute() { index.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ - "message": "Attribute `title` is not sortable. Available sortable attributes are: `id`.", + "message": format!("Index `{}`: Attribute `title` is not sortable. Available sortable attributes are: `id`.", index.uid), "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -998,7 +998,7 @@ async fn sort_unset_ranking_rule() { index.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ - "message": "You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", + "message": format!("Index `{}`: You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", index.uid), "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1024,19 +1024,18 @@ async fn search_on_unknown_field() { index.update_settings_searchable_attributes(json!(["id", "title"])).await; index.wait_task(response.uid()).await.succeeded(); + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `unknown` is not searchable. 
Available searchable attributes are: `id, title`.", index.uid), + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + }); index .search( json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); }, ) .await; @@ -1050,19 +1049,18 @@ async fn search_on_unknown_field_plus_joker() { index.update_settings_searchable_attributes(json!(["id", "title"])).await; index.wait_task(response.uid()).await.succeeded(); + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid), + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + }); index .search( json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}), |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); }, ) .await; @@ -1071,15 +1069,8 @@ async fn search_on_unknown_field_plus_joker() { .search( json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}), |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); }, ) .await; @@ -1092,47 +1083,44 @@ async fn distinct_at_search_time() { let (task, _) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", index.uid), + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(response, @r###" - { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. 
This index does not have configured filterable attributes.", - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; index.wait_task(task.uid()).await; + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", index.uid), + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(response, @r###" - { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; index.wait_task(task.uid()).await; + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.", index.uid), + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(response, @r###" - { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.", - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await; diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi.rs index 8d7340f0d..9377f435a 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi.rs @@ -1070,7 +1070,7 @@ async fn federation_one_query_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `title` is not filterable. 
This index does not have configured filterable attributes.\n1:6 title = toto", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1102,7 +1102,7 @@ async fn federation_one_query_sort_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1166,7 +1166,7 @@ async fn federation_multiple_query_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", + "message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1198,7 +1198,7 @@ async fn federation_multiple_query_sort_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Attribute `title` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1231,7 +1231,7 @@ async fn federation_multiple_query_errors_interleaved() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not filterable. This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not filterable. This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1264,7 +1264,7 @@ async fn federation_multiple_query_sort_errors_interleaved() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" diff --git a/crates/meilisearch/tests/tasks/mod.rs b/crates/meilisearch/tests/tasks/mod.rs index fc05ee4ca..c9d3f31ed 100644 --- a/crates/meilisearch/tests/tasks/mod.rs +++ b/crates/meilisearch/tests/tasks/mod.rs @@ -448,7 +448,7 @@ async fn test_summarized_delete_documents_by_filter() { "originalFilter": "\"doggo = bernese\"" }, "error": { - "message": "Attribute `doggo` is not filterable. 
This index does not have configured filterable attributes.\n1:6 doggo = bernese", + "message": "Index `test`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" diff --git a/crates/meilisearch/tests/vector/binary_quantized.rs b/crates/meilisearch/tests/vector/binary_quantized.rs index 560c4e2f2..790df5459 100644 --- a/crates/meilisearch/tests/vector/binary_quantized.rs +++ b/crates/meilisearch/tests/vector/binary_quantized.rs @@ -318,7 +318,7 @@ async fn try_to_disable_binary_quantization() { } }, "error": { - "message": "`.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.", + "message": "Index `doggo`: `.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index bb20d7b2a..adad9fa81 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -250,7 +250,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -280,7 +280,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -311,7 +311,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. 
Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -340,7 +340,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -369,7 +369,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -398,7 +398,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -440,7 +440,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -469,7 +469,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -498,7 +498,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. 
Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -539,7 +539,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -569,7 +569,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -599,7 +599,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilisearch/tests/vector/openai.rs b/crates/meilisearch/tests/vector/openai.rs index 99aa1f710..b02111639 100644 --- a/crates/meilisearch/tests/vector/openai.rs +++ b/crates/meilisearch/tests/vector/openai.rs @@ -713,7 +713,7 @@ async fn bad_api_key() { } }, "error": { - "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. 
You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -757,7 +757,7 @@ async fn bad_api_key() { } }, "error": { - "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. 
You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index cadc54f24..bf6876fbe 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -985,7 +985,7 @@ async fn bad_settings() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1025,7 +1025,7 @@ async fn bad_settings() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1178,7 +1178,7 @@ async fn server_returns_bad_request() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1247,7 +1247,7 @@ async fn server_returns_bad_request() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - 
server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1306,7 +1306,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1362,7 +1362,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1414,7 +1414,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected 
f32", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1478,7 +1478,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1542,7 +1542,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1908,7 +1908,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1951,7 +1951,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check 
the `apiKey` parameter in the embedder configuration", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -2099,7 +2099,7 @@ async fn searchable_reindex() { ] }, "error": { - "message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" From 08f2c696b0c663c8a668586448e3986d47c41f04 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 9 Dec 2024 09:35:51 +0100 Subject: [PATCH 105/158] Allow xtask bench to proceed without a commit message --- crates/xtask/src/bench/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/xtask/src/bench/mod.rs b/crates/xtask/src/bench/mod.rs index 891742528..deec120fa 100644 --- a/crates/xtask/src/bench/mod.rs +++ b/crates/xtask/src/bench/mod.rs @@ -139,7 +139,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { rt.block_on(async { dashboard_client.send_machine_info(&env).await?; - let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); + let commit_message = build_info.commit_msg.unwrap_or_default().split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); let invocation_uuid = dashboard_client.create_invocation(build_info.clone(), commit_message, env, max_workloads, reason).await?; From bcfed7088863746e096c0d17f5c6b19b6d57ffb8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 9 Dec 2024 10:08:02 +0100 Subject: [PATCH 106/158] Revert "Merge #5125" This reverts commit 9a9383643f9a6b5ee9ab2ace3e9d63b920d94a53, reversing changes made to cac355bfa7e72ca3c5c02cacb4f2fcd3f2dd336e. --- crates/meilisearch/src/option.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 7c59f0607..7e87a5a2c 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -654,9 +654,8 @@ impl Opt { #[derive(Debug, Default, Clone, Parser, Deserialize)] pub struct IndexerOpts { - /// Specifies the maximum resident memory that Meilisearch can use for indexing. - /// By default, Meilisearch limits the RAM usage to 5% of the total available memory. - /// Note that the underlying store utilizes memory-mapping and makes use of the rest. + /// Sets the maximum amount of RAM Meilisearch can use when indexing. By default, Meilisearch + /// uses no more than two thirds of available memory. 
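// As a rough illustration of what these defaults amount to, assuming for example a
// machine with 12 GiB of RAM (the figures are purely illustrative):
//
//     let total = 12u64 * 1024 * 1024 * 1024;        // hypothetical total memory
//     let cli_budget = total * 2 / 3;                 // MaxMemory::default(): ~8 GiB
//     let new_indexer_budget = cli_budget * 5 / 100;  // ~0.4 GiB, see the milli patch below
//
// The remainder is left to the memory-mapped store rather than being reserved by the
// indexer itself.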
#[clap(long, env = MEILI_MAX_INDEXING_MEMORY, default_value_t)] #[serde(default)] pub max_indexing_memory: MaxMemory, @@ -715,7 +714,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { } } -/// A type used to detect the max resident memory available and use 5% of it. +/// A type used to detect the max memory available and use 2/3 of it. #[derive(Debug, Clone, Copy, Deserialize, Serialize)] pub struct MaxMemory(Option); @@ -729,7 +728,7 @@ impl FromStr for MaxMemory { impl Default for MaxMemory { fn default() -> MaxMemory { - MaxMemory(total_memory_bytes().map(|bytes| bytes * 5 / 100).map(Byte::from_u64)) + MaxMemory(total_memory_bytes().map(|bytes| bytes * 2 / 3).map(Byte::from_u64)) } } From f5dd8dfc3e57cfa36fc3ccbefe73de0706a156fd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 9 Dec 2024 10:26:30 +0100 Subject: [PATCH 107/158] Rollback max memory usage changes --- crates/milli/src/update/new/indexer/mod.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 9ee7577a5..59088bd47 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -80,6 +80,15 @@ where let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); + // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch + // is because we still use the old indexer for the settings and it is highly impacted by the + // max memory. So we keep the changes here and will remove these changes once we use the new + // indexer to also index settings. Related to #5125 and #5141. + let grenad_parameters = GrenadParameters { + max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100), + ..grenad_parameters + }; + // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( From 71f59749dca59bec6119da76cef5d984864b43fb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 9 Dec 2024 15:44:06 +0100 Subject: [PATCH 108/158] Reduce union impact in merging --- crates/milli/src/update/new/merger.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 9728f99d6..9e87388a2 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -235,8 +235,12 @@ fn merge_cbo_bitmaps( (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), (Some(current), Some(del), add) => { + debug_assert!( + del.is_subset(&current), + "del is not a subset of current, which must be impossible."
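// A small worked example of why the rewrite just below is equivalent to the previous
// `(current - del) | add` form; it relies on the `del.is_subset(&current)` invariant
// asserted above. Assuming for instance:
//
//     current = {1, 2, 3}, del = {2, 3}, add = {3, 4}
//
//     (current - del) | add                 == {1}    | {3, 4} == {1, 3, 4}
//     (current - (del - add)) | (add - del) == {1, 3} | {4}    == {1, 3, 4}
//
// Both forms yield the same bitmap, but the second one only subtracts entries that are
// really deleted and only unions entries that are really new, so the union operates on
// smaller bitmaps.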
+ ); let output = match add { - Some(add) => (&current - del) | add, + Some(add) => (&current - (&del - &add)) | (add - del), None => &current - del, }; if output.is_empty() { From 07f42e805712fde3087829d9400e767384de7a7f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 9 Dec 2024 15:45:12 +0100 Subject: [PATCH 109/158] Do not index a field count when no word is counted --- .../extract/searchable/extract_word_docids.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 06fb747c6..5e85eb1c8 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -28,7 +28,7 @@ pub struct WordDocidsBalancedCaches<'extractor> { exact_word_docids: BalancedCaches<'extractor>, word_position_docids: BalancedCaches<'extractor>, fid_word_count_docids: BalancedCaches<'extractor>, - fid_word_count: HashMap, + fid_word_count: HashMap, Option)>, current_docid: Option, } @@ -85,8 +85,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { self.fid_word_count .entry(field_id) - .and_modify(|(_current_count, new_count)| *new_count += 1) - .or_insert((0, 1)); + .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1) + .or_insert((None, Some(1))); self.current_docid = Some(docid); Ok(()) @@ -130,8 +130,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { self.fid_word_count .entry(field_id) - .and_modify(|(current_count, _new_count)| *current_count += 1) - .or_insert((1, 0)); + .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1) + .or_insert((Some(1), None)); self.current_docid = Some(docid); @@ -141,14 +141,18 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { fn flush_fid_word_count(&mut self, buffer: &mut BumpVec) -> Result<()> { for (fid, (current_count, new_count)) in self.fid_word_count.drain() { if current_count != new_count { - if current_count <= MAX_COUNTED_WORDS { + if let Some(current_count) = + current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(current_count as u8); self.fid_word_count_docids .insert_del_u32(buffer, self.current_docid.unwrap())?; } - if new_count <= MAX_COUNTED_WORDS { + if let Some(new_count) = + new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(new_count as u8); From 7cf6707ed3d19ff38819f2b824c546b3d64f960b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 10 Dec 2024 11:05:42 +0100 Subject: [PATCH 110/158] Extend test to add the ==512 bytes case --- .../tests/documents/add_documents.rs | 85 ++++++++++++++++++- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index 750bf7ae9..d72b1a7a8 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1264,15 +1264,18 @@ async fn error_add_documents_bad_document_id() { let server = Server::new().await; let index = server.index("test"); index.create(Some("docid")).await; + + // unsupported characters + let documents = json!([ { "docid": "foo & bar", "content": "foobar" } ]); - index.add_documents(documents, None).await; -
index.wait_task(1).await; - let (response, code) = index.get_task(1).await; + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await; + let (response, code) = index.get_task(value.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1288,7 +1291,81 @@ async fn error_add_documents_bad_document_id() { "indexedDocuments": 0 }, "error": { - "message": "Document identifier `\"foo & bar\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Document identifier `\"foo & bar\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_id" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // More than 512 bytes + let documents = json!([ + { + "docid": "a".repeat(600), + "content": "foobar" + } + ]); + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await; + let (response, code) = index.get_task(value.uid()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 2, + "batchUid": 2, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_id" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Exactly 512 bytes + let documents = json!([ + { + "docid": "a".repeat(512), + "content": "foobar" + } + ]); + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await; + let (response, code) = index.get_task(value.uid()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 3, + "batchUid": 3, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_document_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_id" From e610af36aadb429c4cba3599d15e22463ba21e3c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 10 Dec 2024 11:06:24 +0100 Subject: [PATCH 111/158] User failure for documents with docid of ==512 bytes --- crates/milli/src/documents/primary_key.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/documents/primary_key.rs b/crates/milli/src/documents/primary_key.rs index fb8b3d027..c1dd9a9b8 100644 --- a/crates/milli/src/documents/primary_key.rs +++ b/crates/milli/src/documents/primary_key.rs @@ -280,7 +280,7 @@ fn starts_with(selector: &str, key: &str) -> bool { pub fn validate_document_id_str(document_id: &str) -> Option<&str> { if document_id.is_empty() - || document_id.len() > 512 + || document_id.len() >= 512 || !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') { None From 866ac91be3c38d83535e2b3b58a3b90238fa8960 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 10 Dec 2024 11:06:58 +0100 Subject: [PATCH 112/158] Fix error messages --- crates/index-scheduler/src/error.rs | 5 +++-- crates/meilisearch-types/src/error.rs | 2 +- crates/milli/src/error.rs | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index 5fb04828c..f6ee1f685 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -1,12 +1,13 @@ use std::fmt::Display; -use crate::TaskId; use meilisearch_types::batches::BatchId; use meilisearch_types::error::{Code, ErrorCode}; 
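// The `511 bytes` wording in the messages updated by this patch follows from the
// validation tightened in the previous patch, where `document_id.len() >= 512` now
// rejects the id, so the longest accepted document id is 511 bytes. A minimal sketch of
// that boundary (the helper name is illustrative, the checks mirror
// `validate_document_id_str`):
//
//     fn is_valid_docid(id: &str) -> bool {
//         !id.is_empty()
//             && id.len() < 512
//             && id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
//     }
//
//     assert!(is_valid_docid(&"a".repeat(511)));  // 511 bytes: accepted
//     assert!(!is_valid_docid(&"a".repeat(512))); // exactly 512 bytes: rejected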
use meilisearch_types::tasks::{Kind, Status}; use meilisearch_types::{heed, milli}; use thiserror::Error; +use crate::TaskId; + #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DateField { BeforeEnqueuedAt, @@ -103,7 +104,7 @@ pub enum Error { )] InvalidTaskCanceledBy { canceled_by: String }, #[error( - "{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes." + "{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 400 bytes." )] InvalidIndexUid { index_uid: String }, #[error("Task `{0}` not found.")] diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index afc876b42..0c4027899 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -550,7 +550,7 @@ impl fmt::Display for deserr_codes::InvalidSimilarId { "the value of `id` is invalid. \ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ - and can not be more than 512 bytes." + and can not be more than 511 bytes." ) } } diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index a6774a7bd..2bd57bba5 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -114,7 +114,7 @@ pub enum UserError { "Document identifier `{}` is invalid. \ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ -and can not be more than 512 bytes.", .document_id.to_string() +and can not be more than 511 bytes.", .document_id.to_string() )] InvalidDocumentId { document_id: Value }, #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))] From 89637bcaafc43a353d825a7478b3c3b58111e5d8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 11:12:27 +0100 Subject: [PATCH 113/158] Use bumparaw-collections in Meilisearch/milli --- Cargo.lock | 33 ++++++----- crates/index-scheduler/Cargo.toml | 6 +- crates/meilisearch-types/Cargo.toml | 2 +- .../meilisearch-types/src/document_formats.rs | 2 +- crates/milli/Cargo.toml | 2 +- crates/milli/src/prompt/document.rs | 59 ++++++++++--------- crates/milli/src/update/new/document.rs | 2 +- crates/milli/src/update/new/extract/cache.rs | 6 +- .../extract/searchable/tokenize_document.rs | 2 +- crates/milli/src/update/new/indexer/de.rs | 11 ++-- .../update/new/indexer/document_operation.rs | 12 ++-- crates/milli/src/update/new/indexer/mod.rs | 2 +- .../src/update/new/indexer/partial_dump.rs | 5 +- .../update/new/indexer/update_by_function.rs | 2 +- .../milli/src/update/new/vector_document.rs | 2 +- 15 files changed, 78 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3c2fb711e..a57391bfc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -706,6 +706,20 @@ dependencies = [ "serde", ] +[[package]] +name = "bumparaw-collections" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7495aa71334069997d1b4ff536a4a01542981774a1654d4dfb00f29db3aedcef" +dependencies = [ + "allocator-api2", + "bitpacking", + "bumpalo", + "hashbrown 0.15.1", + "serde", + "serde_json", +] + [[package]] name = "byte-unit" version = "5.1.4" @@ -2617,6 +2631,7 @@ dependencies = [ 
"big_s", "bincode", "bumpalo", + "bumparaw-collections", "crossbeam-channel", "csv", "derive_builder 0.20.0", @@ -2631,7 +2646,6 @@ dependencies = [ "meilisearch-types", "memmap2", "page_size", - "raw-collections", "rayon", "roaring", "serde", @@ -3549,6 +3563,7 @@ dependencies = [ "actix-web", "anyhow", "bumpalo", + "bumparaw-collections", "convert_case 0.6.0", "csv", "deserr", @@ -3561,7 +3576,6 @@ dependencies = [ "meili-snap", "memmap2", "milli", - "raw-collections", "roaring", "serde", "serde-cs", @@ -3618,6 +3632,7 @@ dependencies = [ "bincode", "bstr", "bumpalo", + "bumparaw-collections", "bytemuck", "byteorder", "candle-core", @@ -3656,7 +3671,6 @@ dependencies = [ "once_cell", "ordered-float", "rand", - "raw-collections", "rayon", "rayon-par-bridge", "rhai", @@ -4487,19 +4501,6 @@ dependencies = [ "rand", ] -[[package]] -name = "raw-collections" -version = "0.1.0" -source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a" -dependencies = [ - "allocator-api2", - "bitpacking", - "bumpalo", - "hashbrown 0.15.1", - "serde", - "serde_json", -] - [[package]] name = "raw-cpuid" version = "10.7.0" diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index ad4c1b4b9..a2b9debec 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -13,6 +13,8 @@ license.workspace = true [dependencies] anyhow = "1.0.86" bincode = "1.3.3" +bumpalo = "3.16.0" +bumparaw-collections = "0.1.1" csv = "1.3.0" derive_builder = "0.20.0" dump = { path = "../dump" } @@ -21,8 +23,8 @@ file-store = { path = "../file-store" } flate2 = "1.0.30" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } +memmap2 = "0.9.4" page_size = "0.6.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } rayon = "1.10.0" roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } @@ -30,7 +32,6 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] } synchronoise = "1.0.1" tempfile = "3.10.1" thiserror = "1.0.61" -memmap2 = "0.9.4" time = { version = "0.3.36", features = [ "serde-well-known", "formatting", @@ -40,7 +41,6 @@ time = { version = "0.3.36", features = [ tracing = "0.1.40" ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } -bumpalo = "3.16.0" [dev-dependencies] arroy = "0.5.0" diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index aca06a018..b91689ed7 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,7 +24,7 @@ flate2 = "1.0.30" fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } +bumparaw-collections = "0.1.1" roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 008be4022..c6e8ad907 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -4,10 +4,10 @@ use std::io::{self, BufWriter}; use std::marker::PhantomData; use bumpalo::Bump; +use bumparaw_collections::RawMap; use memmap2::Mmap; use milli::documents::Error; use milli::Object; -use raw_collections::RawMap; use 
serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 2a959b654..ae1edd168 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -91,8 +91,8 @@ ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" hashbrown = "0.15.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } bumpalo = "3.16.0" +bumparaw-collections = "0.1.1" thread_local = "1.1.8" allocator-api2 = "0.2.18" rustc-hash = "2.0.0" diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index dea7946da..5232b6788 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -3,12 +3,12 @@ use std::collections::BTreeMap; use std::fmt::{self, Debug}; use bumpalo::Bump; +use bumparaw_collections::{RawMap, RawVec, Value}; use liquid::model::{ ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; -use raw_collections::{RawMap, RawVec}; use serde_json::value::RawValue; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; @@ -245,12 +245,12 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, #[derive(Debug)] struct ParseableValue<'doc> { - value: raw_collections::Value<'doc>, + value: Value<'doc>, } impl<'doc> ParseableValue<'doc> { pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { - let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); + let value = Value::from_raw_value(value, doc_alloc).unwrap(); Self { value } } @@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn render(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.render(), Value::Bool(v) => v.render(), @@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn source(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.source(), Value::Bool(v) => ValueView::source(v), @@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn type_name(&self) -> &'static str { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.type_name(), Value::Bool(v) => v.type_name(), @@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn query_state(&self, state: State) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::query_state(&LiquidValue::Nil, state), Value::Bool(v) => ValueView::query_state(v, state), @@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_kstr(&self) -> KStringCow<'_> { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::to_kstr(&LiquidValue::Nil), Value::Bool(v) => ValueView::to_kstr(v), @@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_value(&self) -> LiquidValue { - use raw_collections::Value; + 
use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil, Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { - raw_collections::value::Number::PosInt(number) => { + Number::PosInt(number) => { let number: i64 = match (*number).try_into() { Ok(number) => number, Err(_) => { @@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { }; LiquidValue::Scalar(ScalarCow::new(number)) } - raw_collections::value::Number::NegInt(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } - raw_collections::value::Number::Finite(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } + Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)), + Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)), }, Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), @@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn as_scalar(&self) -> Option> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { @@ -576,34 +580,35 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn is_scalar(&self) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) } fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { - if let raw_collections::Value::Array(array) = &self.value { + if let Value::Array(array) = &self.value { return Some(ParseableArray::as_parseable(array) as _); } None } fn is_array(&self) -> bool { - matches!(&self.value, raw_collections::Value::Array(_)) + matches!(&self.value, bumparaw_collections::Value::Array(_)) } fn as_object(&self) -> Option<&dyn ObjectView> { - if let raw_collections::Value::Object(object) = &self.value { + if let Value::Object(object) = &self.value { return Some(ParseableMap::as_parseable(object) as _); } None } fn is_object(&self) -> bool { - matches!(&self.value, raw_collections::Value::Object(_)) + matches!(&self.value, bumparaw_collections::Value::Object(_)) } fn is_nil(&self) -> bool { - matches!(&self.value, raw_collections::Value::Null) + matches!(&self.value, bumparaw_collections::Value::Null) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b1a2218f2..2beefc7d5 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -1,7 +1,7 @@ use std::collections::{BTreeMap, BTreeSet}; +use bumparaw_collections::RawMap; use heed::RoTxn; -use raw_collections::RawMap; use serde_json::value::RawValue; use super::vector_document::VectorDocument; diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 658a3127c..09ca60211 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -69,12 +69,12 @@ use std::io::BufReader; use std::{io, iter, mem}; use bumpalo::Bump; +use bumparaw_collections::bbbul::{BitPacker, BitPacker4x}; +use bumparaw_collections::map::FrozenMap; +use bumparaw_collections::{Bbbul, FrozenBbbul}; use grenad::ReaderCursor; use 
hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -use raw_collections::bbbul::{BitPacker, BitPacker4x}; -use raw_collections::map::FrozenMap; -use raw_collections::{Bbbul, FrozenBbbul}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index ffdce5b7e..3aa546272 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -176,9 +176,9 @@ pub fn tokenizer_builder<'a>( #[cfg(test)] mod test { use bumpalo::Bump; + use bumparaw_collections::RawMap; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use raw_collections::RawMap; use serde_json::json; use serde_json::value::RawValue; diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index c9808360e..7fd983f29 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -1,6 +1,7 @@ use std::ops::ControlFlow; use bumpalo::Bump; +use bumparaw_collections::RawVec; use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; @@ -360,7 +361,7 @@ impl<'a> DeserrRawValue<'a> { } pub struct DeserrRawVec<'a> { - vec: raw_collections::RawVec<'a>, + vec: RawVec<'a>, alloc: &'a Bump, } @@ -379,7 +380,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> { } pub struct DeserrRawVecIter<'a> { - it: raw_collections::vec::iter::IntoIter<'a>, + it: bumparaw_collections::vec::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -393,7 +394,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { } pub struct DeserrRawMap<'a> { - map: raw_collections::RawMap<'a>, + map: bumparaw_collections::RawMap<'a>, alloc: &'a Bump, } @@ -416,7 +417,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> { } pub struct DeserrRawMapIter<'a> { - it: raw_collections::map::iter::IntoIter<'a>, + it: bumparaw_collections::map::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -615,7 +616,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { where A: serde::de::SeqAccess<'de>, { - let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); + let mut raw_vec = RawVec::new_in(self.alloc); while let Some(next) = seq.next_element()? 
{ raw_vec.push(next); } diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 2a381d5d1..139cef11b 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -1,9 +1,9 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; +use bumparaw_collections::RawMap; use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; -use raw_collections::RawMap; use rayon::slice::ParallelSlice; use serde_json::value::RawValue; use serde_json::Deserializer; @@ -545,8 +545,8 @@ impl MergeChanges for MergeDocumentForReplacement { match operations.last() { Some(InnerDocOp::Addition(DocumentOffset { content })) => { let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( @@ -632,8 +632,8 @@ impl MergeChanges for MergeDocumentForUpdates { } }; let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; Some(Versions::single(document)) } @@ -647,7 +647,7 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + let document = RawMap::from_raw_value(document, doc_alloc) .map_err(UserError::SerdeJson)?; Ok(document) }); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 59088bd47..00041ecaf 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -4,6 +4,7 @@ use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; use big_s::S; +use bumparaw_collections::RawMap; use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; @@ -13,7 +14,6 @@ use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; -use raw_collections::RawMap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 2cc653813..f687fda99 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -1,5 +1,6 @@ use std::ops::DerefMut; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; use serde_json::value::RawValue; @@ -75,8 +76,8 @@ where self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; let external_document_id = external_document_id.to_de(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(InternalError::SerdeJson)?; + let document = + RawMap::from_raw_value(document, doc_alloc).map_err(InternalError::SerdeJson)?; let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); Ok(Some(DocumentChange::Insertion(insertion))) diff --git 
a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index a8e3e38a8..59d7098e5 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -1,4 +1,4 @@ -use raw_collections::RawMap; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 319730db0..419c3dc05 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -1,9 +1,9 @@ use std::collections::BTreeSet; use bumpalo::Bump; +use bumparaw_collections::RawMap; use deserr::{Deserr, IntoValue}; use heed::RoTxn; -use raw_collections::RawMap; use serde::Serialize; use serde_json::value::RawValue; From d075be798a5ec5086c42adb4882e0917a221fa93 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 10 Dec 2024 11:07:10 +0100 Subject: [PATCH 114/158] Fix tests --- crates/meilisearch/tests/documents/update_documents.rs | 2 +- crates/meilisearch/tests/similar/errors.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/meilisearch/tests/documents/update_documents.rs b/crates/meilisearch/tests/documents/update_documents.rs index c0703e81b..aaf529ce5 100644 --- a/crates/meilisearch/tests/documents/update_documents.rs +++ b/crates/meilisearch/tests/documents/update_documents.rs @@ -172,7 +172,7 @@ async fn error_update_documents_bad_document_id() { assert_eq!( response["error"]["message"], json!( - r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes."# + r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes."# ) ); assert_eq!(response["error"]["code"], json!("invalid_document_id")); diff --git a/crates/meilisearch/tests/similar/errors.rs b/crates/meilisearch/tests/similar/errors.rs index 1e933e1c0..86fca97ad 100644 --- a/crates/meilisearch/tests/similar/errors.rs +++ b/crates/meilisearch/tests/similar/errors.rs @@ -79,7 +79,7 @@ async fn similar_bad_id() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" @@ -172,7 +172,7 @@ async fn similar_invalid_id() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" From 6b269795d25257f34d398b8198386e3a3c768f60 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 14:25:13 +0100 Subject: [PATCH 115/158] Update bumparaw-collections to 0.1.2 --- Cargo.lock | 4 ++-- crates/index-scheduler/Cargo.toml | 2 +- crates/meilisearch-types/Cargo.toml | 2 +- crates/milli/Cargo.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a57391bfc..34bea88da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -708,9 +708,9 @@ dependencies = [ [[package]] name = "bumparaw-collections" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7495aa71334069997d1b4ff536a4a01542981774a1654d4dfb00f29db3aedcef" +checksum = "833a74d1cb25094593307c17044e4140828b553d1d653bc3ec9928aa88a6d88a" dependencies = [ "allocator-api2", "bitpacking", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index a2b9debec..5d7eb1913 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -14,7 +14,7 @@ license.workspace = true anyhow = "1.0.86" bincode = "1.3.3" bumpalo = "3.16.0" -bumparaw-collections = "0.1.1" +bumparaw-collections = "0.1.2" csv = "1.3.0" derive_builder = "0.20.0" dump = { path = "../dump" } diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index b91689ed7..e81e6dd35 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,7 +24,7 @@ flate2 = "1.0.30" fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } -bumparaw-collections = "0.1.1" +bumparaw-collections = "0.1.2" roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index ae1edd168..9f113e013 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -92,7 +92,7 @@ url = "2.5.2" rayon-par-bridge = "0.1.0" hashbrown = "0.15.0" bumpalo = "3.16.0" -bumparaw-collections = "0.1.1" +bumparaw-collections = "0.1.2" thread_local = "1.1.8" allocator-api2 = "0.2.18" rustc-hash = "2.0.0" From a751972c5726ff0a23dc433fd9f0702f88e153b9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 14:25:53 +0100 Subject: [PATCH 116/158] Prefer using a stable than a random hash builder --- crates/milli/src/update/new/document.rs | 7 ++++--- .../new/extract/searchable/tokenize_document.rs | 3 ++- crates/milli/src/update/new/indexer/de.rs | 3 ++- .../update/new/indexer/document_operation.rs | 17 +++++++++++------ crates/milli/src/update/new/indexer/mod.rs | 3 ++- .../src/update/new/indexer/partial_dump.rs | 5 +++-- .../update/new/indexer/update_by_function.rs | 9 +++++++-- crates/milli/src/update/new/vector_document.rs | 16 +++++++++------- 8 files changed, 40 insertions(+), 23 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 
2beefc7d5..930b0c078 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -2,6 +2,7 @@ use std::collections::{BTreeMap, BTreeSet}; use bumparaw_collections::RawMap; use heed::RoTxn; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::vector_document::VectorDocument; @@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue); #[derive(Debug)] pub struct Versions<'doc> { - data: RawMap<'doc>, + data: RawMap<'doc, FxBuildHasher>, } impl<'doc> Versions<'doc> { pub fn multiple( - mut versions: impl Iterator>>, + mut versions: impl Iterator>>, ) -> Result> { let Some(data) = versions.next() else { return Ok(None) }; let mut data = data?; @@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> { Ok(Some(Self::single(data))) } - pub fn single(version: RawMap<'doc>) -> Self { + pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self { Self { data: version } } diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index 3aa546272..1c1605b66 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -179,6 +179,7 @@ mod test { use bumparaw_collections::RawMap; use charabia::TokenizerBuilder; use meili_snap::snapshot; + use rustc_hash::FxBuildHasher; use serde_json::json; use serde_json::value::RawValue; @@ -234,7 +235,7 @@ mod test { let bump = Bump::new(); let document: &RawValue = serde_json::from_str(&document).unwrap(); - let document = RawMap::from_raw_value(document, &bump).unwrap(); + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap(); let document = Versions::single(document); let document = DocumentFromVersions::new(&document); diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index 7fd983f29..4d9fa40a1 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -2,6 +2,7 @@ use std::ops::ControlFlow; use bumpalo::Bump; use bumparaw_collections::RawVec; +use rustc_hash::FxBuildHasher; use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; @@ -394,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { } pub struct DeserrRawMap<'a> { - map: bumparaw_collections::RawMap<'a>, + map: bumparaw_collections::RawMap<'a, FxBuildHasher>, alloc: &'a Bump, } diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 139cef11b..0b7ec493e 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -5,6 +5,7 @@ use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; use rayon::slice::ParallelSlice; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use serde_json::Deserializer; @@ -166,8 +167,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( // Only guess the primary key if it is the first document let retrieved_primary_key = if previous_offset == 0 { - let doc = - RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; + let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer) + .map(Some) + .map_err(UserError::SerdeJson)?; let result = retrieve_or_guess_primary_key( rtxn, @@ -546,7 +548,8 @@ impl MergeChanges for MergeDocumentForReplacement 
{ Some(InnerDocOp::Addition(DocumentOffset { content })) => { let document = serde_json::from_slice(content).unwrap(); let document = - RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( @@ -633,7 +636,8 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); let document = - RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; Some(Versions::single(document)) } @@ -647,8 +651,9 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); - let document = RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; Ok(document) }); Versions::multiple(versions)? diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 00041ecaf..601645385 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -14,6 +14,7 @@ use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; +use rustc_hash::FxBuildHasher; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -776,7 +777,7 @@ pub fn retrieve_or_guess_primary_key<'a>( index: &Index, new_fields_ids_map: &mut FieldsIdsMap, primary_key_from_op: Option<&'a str>, - first_document: Option>, + first_document: Option>, ) -> Result, bool), UserError>> { // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. 
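Every hunk in this commit applies the same substitution: `RawMap::from_raw_value(raw, alloc)` becomes `RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, alloc)`, so document fields are hashed with a stable, seed-free hasher instead of a randomly seeded one, keeping hashing deterministic across runs and threads. A minimal sketch of the resulting call shape, assuming the bumparaw-collections API used in this series and that the parse error is a plain `serde_json::Error` (the real call sites wrap it into milli's own error types):

use bumpalo::Bump;
use bumparaw_collections::RawMap;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;

// Parse one raw JSON document into a bump-allocated map keyed with the
// stable FxBuildHasher, mirroring the calls introduced by this commit.
// The `serde_json::Error` return type is an assumption made for this sketch.
fn parse_document<'doc>(
    raw: &'doc RawValue,
    doc_alloc: &'doc Bump,
) -> Result<RawMap<'doc, FxBuildHasher>, serde_json::Error> {
    RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, doc_alloc)
}

The same three-argument call is what document_operation.rs, partial_dump.rs, update_by_function.rs and vector_document.rs switch to in the hunks that follow.
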
diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index f687fda99..6e4abd898 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -2,6 +2,7 @@ use std::ops::DerefMut; use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::document_changes::{DocumentChangeContext, DocumentChanges}; @@ -76,8 +77,8 @@ where self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; let external_document_id = external_document_id.to_de(); - let document = - RawMap::from_raw_value(document, doc_alloc).map_err(InternalError::SerdeJson)?; + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(InternalError::SerdeJson)?; let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); Ok(Some(DocumentChange::Insertion(insertion))) diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index 59d7098e5..3001648e6 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -3,6 +3,7 @@ use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; +use rustc_hash::FxBuildHasher; use super::document_changes::DocumentChangeContext; use super::DocumentChanges; @@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { if document_id != new_document_id { Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) } else { - let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) - .map_err(InternalError::SerdeJson)?; + let raw_new_doc = RawMap::from_raw_value_and_hasher( + raw_new_doc, + FxBuildHasher, + doc_alloc, + ) + .map_err(InternalError::SerdeJson)?; Ok(Some(DocumentChange::Update(Update::create( docid, diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 419c3dc05..8d14a749d 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -4,6 +4,7 @@ use bumpalo::Bump; use bumparaw_collections::RawMap; use deserr::{Deserr, IntoValue}; use heed::RoTxn; +use rustc_hash::FxBuildHasher; use serde::Serialize; use serde_json::value::RawValue; @@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> { docid: DocumentId, embedding_config: Vec, index: &'t Index, - vectors_field: Option>, + vectors_field: Option>, rtxn: &'t RoTxn<'t>, doc_alloc: &'t Bump, } @@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> { }; let vectors = document.vectors_field()?; let vectors_field = match vectors { - Some(vectors) => { - Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) 
- } + Some(vectors) => Some( + RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc) + .map_err(InternalError::SerdeJson)?, + ), None => None, }; @@ -220,7 +222,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, - vectors: RawMap<'doc>, + vectors: RawMap<'doc, FxBuildHasher>, embedders: &'doc EmbeddingConfigs, } @@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> { ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { - let vectors = - RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; + let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump) + .map_err(UserError::SerdeJson)?; Ok(Some(Self { external_document_id, vectors, embedders })) } else { Ok(None) From aeb6b74725b3eecda3eecec20b4f37d815dc929c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 15:52:22 +0100 Subject: [PATCH 117/158] Make sure we use an FxHashBuilder on the Value --- Cargo.lock | 4 ++-- crates/milli/src/prompt/document.rs | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 34bea88da..9476506ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -708,9 +708,9 @@ dependencies = [ [[package]] name = "bumparaw-collections" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833a74d1cb25094593307c17044e4140828b553d1d653bc3ec9928aa88a6d88a" +checksum = "4ce682bdc86c2e25ef5cd95881d9d6a1902214eddf74cf9ffea88fe1464377e8" dependencies = [ "allocator-api2", "bitpacking", diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index 5232b6788..ae0a506ac 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -9,6 +9,7 @@ use liquid::model::{ Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc } impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { - fn as_debug(&self) -> &dyn fmt::Debug { + fn as_debug(&self) -> &dyn Debug { self } fn render(&self) -> liquid::model::DisplayCow<'_> { @@ -243,14 +244,13 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, } } -#[derive(Debug)] struct ParseableValue<'doc> { - value: Value<'doc>, + value: Value<'doc, FxBuildHasher>, } impl<'doc> ParseableValue<'doc> { pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { - let value = Value::from_raw_value(value, doc_alloc).unwrap(); + let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap(); Self { value } } @@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> { } // transparent newtype for implementing ValueView -#[repr(transparent)] #[derive(Debug)] -struct ParseableMap<'doc>(RawMap<'doc>); +#[repr(transparent)] +struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>); // transparent newtype for implementing ValueView -#[repr(transparent)] #[derive(Debug)] +#[repr(transparent)] struct ParseableArray<'doc>(RawVec<'doc>); impl<'doc> ParseableMap<'doc> { - pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> { + pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a 
ParseableMap<'doc> { // SAFETY: repr(transparent) - unsafe { &*(map as *const RawMap as *const Self) } + unsafe { &*(map as *const RawMap as *const Self) } } } @@ -612,6 +612,12 @@ impl<'doc> ValueView for ParseableValue<'doc> { } } +impl Debug for ParseableValue<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ParseableValue").field("value", &self.value).finish() + } +} + struct ArraySource<'s, 'doc> { s: &'s RawVec<'doc>, } From bb00e70087a58328dc1062ddc766c68097aeadd2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 10:39:04 +0100 Subject: [PATCH 118/158] Reintroduce the document addition logs --- crates/index-scheduler/src/batch.rs | 35 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 9a3ba4929..93e9a1404 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -33,7 +33,9 @@ use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; -use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings}; +use meilisearch_types::milli::update::{ + DocumentAdditionResult, IndexDocumentsMethod, Settings as MilliSettings, +}; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; @@ -1310,9 +1312,9 @@ impl IndexScheduler { ) .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; - let mut addition = 0; + let mut candidates_count = 0; for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { - addition += stats.document_count; + candidates_count += stats.document_count; match stats.error { Some(error) => { task.status = Status::Failed; @@ -1358,6 +1360,13 @@ impl IndexScheduler { ) .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1436,6 +1445,7 @@ impl IndexScheduler { } }; + let candidates_count = candidates.len(); let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); let document_changes = pool .install(|| { @@ -1464,7 +1474,14 @@ impl IndexScheduler { ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; + + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } match result_count { @@ -1585,6 +1602,7 @@ impl IndexScheduler { }; let mut indexer = indexer::DocumentDeletion::new(); + let candidates_count = to_delete.len(); indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); let embedders = index @@ -1607,7 
+1625,14 @@ impl IndexScheduler { ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; + + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } Ok(tasks) From 479607e5dd9185a1a69ec39f05a6c97be8e87c98 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 10 Dec 2024 15:12:50 +0100 Subject: [PATCH 119/158] Convert update files from OBKV to ndjson --- Cargo.lock | 13 +++--- crates/meilitool/Cargo.toml | 5 ++- crates/meilitool/src/main.rs | 2 +- crates/meilitool/src/upgrade/mod.rs | 5 +++ crates/meilitool/src/upgrade/v1_12.rs | 63 +++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 crates/meilitool/src/upgrade/v1_12.rs diff --git a/Cargo.lock b/Cargo.lock index 9476506ec..ae2715f25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2661,12 +2661,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.15.1", "serde", ] @@ -3597,9 +3597,12 @@ dependencies = [ "clap", "dump", "file-store", + "indexmap", "meilisearch-auth", "meilisearch-types", "serde", + "serde_json", + "tempfile", "time", "uuid", ] @@ -4969,9 +4972,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "indexmap", "itoa", diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index 048da6232..7d0b9f32c 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -10,12 +10,15 @@ license.workspace = true [dependencies] anyhow = "1.0.86" +arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" } clap = { version = "4.5.9", features = ["derive"] } dump = { path = "../dump" } file-store = { path = "../file-store" } +indexmap = {version = "2.7.0", features = ["serde"]} meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } +serde_json = {version = "1.0.133", features = ["preserve_order"]} +tempfile = "3.14.0" time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } -arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" } diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index f84cea98d..44eb4960e 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -73,7 +73,7 @@ enum Command { /// /// Supported upgrade paths: /// - /// - v1.9.x -> v1.10.x -> v1.11.x + /// - 
v1.9.x -> v1.10.x -> v1.11.x -> v1.12.x OfflineUpgrade { #[arg(long)] target_version: String, diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 36630c3b3..50882f610 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -1,5 +1,6 @@ mod v1_10; mod v1_11; +mod v1_12; mod v1_9; use std::path::{Path, PathBuf}; @@ -8,6 +9,7 @@ use anyhow::{bail, Context}; use meilisearch_types::versioning::create_version_file; use v1_10::v1_9_to_v1_10; +use v1_12::v1_11_to_v1_12; use crate::upgrade::v1_11::v1_10_to_v1_11; @@ -22,6 +24,7 @@ impl OfflineUpgrade { let upgrade_list = [ (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), (v1_10_to_v1_11, "1", "11", "0"), + (v1_11_to_v1_12, "1", "12", "0"), ]; let (current_major, current_minor, current_patch) = &self.current_version; @@ -33,6 +36,7 @@ impl OfflineUpgrade { ) { ("1", "9", _) => 0, ("1", "10", _) => 1, + ("1", "11", _) => 2, _ => { bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10") } @@ -43,6 +47,7 @@ impl OfflineUpgrade { let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { ("1", "10", _) => 0, ("1", "11", _) => 1, + ("1", "12", _) => 2, (major, _, _) if major.starts_with('v') => { bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") } diff --git a/crates/meilitool/src/upgrade/v1_12.rs b/crates/meilitool/src/upgrade/v1_12.rs new file mode 100644 index 000000000..ab97f417b --- /dev/null +++ b/crates/meilitool/src/upgrade/v1_12.rs @@ -0,0 +1,63 @@ +//! The breaking changes that happened between the v1.11 and the v1.12 are: +//! - The new indexer changed the update files format from OBKV to ndjson. https://github.com/meilisearch/meilisearch/pull/4900 + +use std::{io::BufWriter, path::Path}; + +use anyhow::Context; +use file_store::FileStore; +use indexmap::IndexMap; +use meilisearch_types::milli::documents::DocumentsBatchReader; +use serde_json::value::RawValue; +use tempfile::NamedTempFile; + +pub fn v1_11_to_v1_12(db_path: &Path) -> anyhow::Result<()> { + println!("Upgrading from v1.11.0 to v1.12.0"); + + convert_update_files(db_path)?; + + Ok(()) +} + +/// Convert the update files from OBKV to ndjson format. +/// +/// 1) List all the update files using the file store. +/// 2) For each update file, read the update file into a DocumentsBatchReader. +/// 3) For each document in the update file, convert the document to a JSON object. +/// 4) Write the JSON object to a tmp file in the update files directory. +/// 5) Persist the tmp file replacing the old update file. +fn convert_update_files(db_path: &Path) -> anyhow::Result<()> { + let update_files_dir_path = db_path.join("update_files"); + let file_store = FileStore::new(&update_files_dir_path)?; + + for uuid in file_store.all_uuids()? { + let uuid = uuid?; + let update_file_path = file_store.get_update_path(uuid); + let update_file = file_store.get_update(uuid)?; + + let mut file = NamedTempFile::new_in(&update_files_dir_path).map(BufWriter::new)?; + + let reader = DocumentsBatchReader::from_reader(update_file)?; + let (mut cursor, index) = reader.into_cursor_and_fields_index(); + + while let Some(document) = cursor.next_document()? 
{ + let mut json_document = IndexMap::new(); + for (fid, value) in document { + let field_name = index + .name(fid) + .with_context(|| format!("while getting field name for fid {fid}"))?; + let value: &RawValue = serde_json::from_slice(value)?; + json_document.insert(field_name, value); + } + + serde_json::to_writer(&mut file, &json_document)?; + } + + let file = file + .into_inner() + .map_err(|e| e.into_error()) + .context("while flushing update file bufwriter")?; + let _ = file.persist(update_file_path)?; + } + + Ok(()) +} From c614d0dd353947c46de2da8635e6e4b8e0b8404c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 11 Dec 2024 09:54:34 +0100 Subject: [PATCH 120/158] Add context when returning an error --- crates/meilitool/src/upgrade/v1_12.rs | 40 ++++++++++++++++++--------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/crates/meilitool/src/upgrade/v1_12.rs b/crates/meilitool/src/upgrade/v1_12.rs index ab97f417b..77060d90d 100644 --- a/crates/meilitool/src/upgrade/v1_12.rs +++ b/crates/meilitool/src/upgrade/v1_12.rs @@ -27,24 +27,37 @@ pub fn v1_11_to_v1_12(db_path: &Path) -> anyhow::Result<()> { /// 5) Persist the tmp file replacing the old update file. fn convert_update_files(db_path: &Path) -> anyhow::Result<()> { let update_files_dir_path = db_path.join("update_files"); - let file_store = FileStore::new(&update_files_dir_path)?; + let file_store = FileStore::new(&update_files_dir_path).with_context(|| { + format!("while creating file store for update files dir {update_files_dir_path:?}") + })?; - for uuid in file_store.all_uuids()? { - let uuid = uuid?; + for uuid in file_store.all_uuids().context("while retrieving uuids from file store")? { + let uuid = uuid.context("while retrieving uuid from file store")?; let update_file_path = file_store.get_update_path(uuid); - let update_file = file_store.get_update(uuid)?; + let update_file = file_store + .get_update(uuid) + .with_context(|| format!("while getting update file for uuid {uuid:?}"))?; - let mut file = NamedTempFile::new_in(&update_files_dir_path).map(BufWriter::new)?; + let mut file = + NamedTempFile::new_in(&update_files_dir_path).map(BufWriter::new).with_context( + || format!("while creating bufwriter for update file {update_file_path:?}"), + )?; - let reader = DocumentsBatchReader::from_reader(update_file)?; + let reader = DocumentsBatchReader::from_reader(update_file).with_context(|| { + format!("while creating documents batch reader for update file {update_file_path:?}") + })?; let (mut cursor, index) = reader.into_cursor_and_fields_index(); - while let Some(document) = cursor.next_document()? { + while let Some(document) = cursor.next_document().with_context(|| { + format!( + "while reading documents from batch reader for update file {update_file_path:?}" + ) + })? 
{ let mut json_document = IndexMap::new(); for (fid, value) in document { let field_name = index .name(fid) - .with_context(|| format!("while getting field name for fid {fid}"))?; + .with_context(|| format!("while getting field name for fid {fid} for update file {update_file_path:?}"))?; let value: &RawValue = serde_json::from_slice(value)?; json_document.insert(field_name, value); } @@ -52,11 +65,12 @@ fn convert_update_files(db_path: &Path) -> anyhow::Result<()> { serde_json::to_writer(&mut file, &json_document)?; } - let file = file - .into_inner() - .map_err(|e| e.into_error()) - .context("while flushing update file bufwriter")?; - let _ = file.persist(update_file_path)?; + let file = file.into_inner().map_err(|e| e.into_error()).context(format!( + "while flushing update file bufwriter for update file {update_file_path:?}" + ))?; + let _ = file + .persist(&update_file_path) + .with_context(|| format!("while persisting update file {update_file_path:?}"))?; } Ok(()) From d683f5980ce7232c7e9be4d0b3d3f5aefb0335af Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 19:19:27 +0100 Subject: [PATCH 121/158] Do not duplicate NDJson when unecessary --- crates/file-store/src/lib.rs | 8 ++ .../meilisearch-types/src/document_formats.rs | 21 ++- .../src/routes/indexes/documents.rs | 127 +++++++++++------- 3 files changed, 98 insertions(+), 58 deletions(-) diff --git a/crates/file-store/src/lib.rs b/crates/file-store/src/lib.rs index c8b3849ab..39ed9482b 100644 --- a/crates/file-store/src/lib.rs +++ b/crates/file-store/src/lib.rs @@ -136,6 +136,14 @@ pub struct File { } impl File { + pub fn from_parts(path: PathBuf, file: Option) -> Self { + Self { path, file } + } + + pub fn into_parts(self) -> (PathBuf, Option) { + (self.path, self.file) + } + pub fn dry_file() -> Result { Ok(Self { path: PathBuf::new(), file: None }) } diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index c6e8ad907..4820ac523 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -250,26 +250,25 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { } } -/// Reads NDJSON from file and write it in NDJSON in a file checking it along the way. -pub fn read_ndjson(input: &File, output: impl io::Write) -> Result { +/// Reads NDJSON from file and checks it. +pub fn read_ndjson(input: &File) -> Result { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? 
}; - let mut output = BufWriter::new(output); - let mut bump = Bump::with_capacity(1024 * 1024); let mut count = 0; for result in serde_json::Deserializer::from_slice(&input).into_iter() { bump.reset(); - count += 1; - result - .and_then(|raw: &RawValue| { + match result { + Ok(raw) => { // try to deserialize as a map - let map = RawMap::from_raw_value(raw, &bump)?; - to_writer(&mut output, &map) - }) - .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; + RawMap::from_raw_value(raw, &bump) + .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; + count += 1; + } + Err(e) => return Err(DocumentFormatError::from((PayloadType::Ndjson, e))), + } } Ok(count) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 47f73ef42..0b18810d7 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use std::io::ErrorKind; +use std::io::{ErrorKind, Seek as _}; use std::marker::PhantomData; use actix_web::http::header::CONTENT_TYPE; @@ -572,7 +572,7 @@ async fn document_addition( index_uid: IndexUid, primary_key: Option, csv_delimiter: Option, - mut body: Payload, + body: Payload, method: IndexDocumentsMethod, task_id: Option, dry_run: bool, @@ -609,54 +609,54 @@ async fn document_addition( }; let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; + let documents_count = match format { + PayloadType::Ndjson => { + let (path, file) = update_file.into_parts(); + let file = match file { + Some(file) => { + let (file, path) = file.into_parts(); + let mut file = copy_body_to_file(file, body, format).await?; + file.rewind().map_err(|e| { + index_scheduler::Error::FileStore(file_store::Error::IoError(e)) + })?; + Some(tempfile::NamedTempFile::from_parts(file, path)) + } + None => None, + }; - let temp_file = match tempfile() { - Ok(file) => file, - Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + let documents_count = file + .as_ref() + .map_or(Ok(0), |ntf| read_ndjson(ntf.as_file())) + .map_err(|e| MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + let update_file = file_store::File::from_parts(path, file); + update_file.persist()?; + Ok(documents_count) + } + PayloadType::Json | PayloadType::Csv { delimiter: _ } => { + let temp_file = match tempfile() { + Ok(file) => file, + Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + }; + + let read_file = copy_body_to_file(temp_file, body, format).await?; + tokio::task::spawn_blocking(move || { + let documents_count = match format { + PayloadType::Json => read_json(&read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => { + read_csv(&read_file, &mut update_file, delimiter)? + } + PayloadType::Ndjson => { + unreachable!("We already wrote the user content into the update file") + } + }; + // we NEED to persist the file here because we moved the `udpate_file` in another task. 
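// Aside on the `read_ndjson` rewrite above: since the NDJSON payload is now
// streamed straight into the update file, the function no longer copies the
// documents anywhere; it only walks the memory-mapped bytes and checks that
// every entry is a JSON object. A standalone sketch of that validate-only
// pass, with the error type simplified to `serde_json::Error` (an assumption
// of this sketch; the real function wraps errors in `DocumentFormatError`).
// A later commit in this series swaps the map construction to
// `from_raw_value_and_hasher(.., FxBuildHasher, ..)`.
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use serde_json::value::RawValue;

fn count_and_validate_ndjson(input: &[u8]) -> Result<u64, serde_json::Error> {
    let mut bump = Bump::with_capacity(1024 * 1024);
    let mut count = 0;
    for result in serde_json::Deserializer::from_slice(input).into_iter::<&RawValue>() {
        bump.reset();
        // Building the bump-backed RawMap fails on anything that is not a
        // JSON object, which is the only check needed before the file is
        // persisted and the task is registered.
        RawMap::from_raw_value(result?, &bump)?;
        count += 1;
    }
    Ok(count)
}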
+ update_file.persist()?; + Ok(documents_count) + }) + .await + } }; - let async_file = File::from_std(temp_file); - let mut buffer = BufWriter::new(async_file); - - let mut buffer_write_size: usize = 0; - while let Some(result) = body.next().await { - let byte = result?; - - if byte.is_empty() && buffer_write_size == 0 { - return Err(MeilisearchHttpError::MissingPayload(format)); - } - - match buffer.write_all(&byte).await { - Ok(()) => buffer_write_size += 1, - Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), - } - } - - if let Err(e) = buffer.flush().await { - return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); - } - - if buffer_write_size == 0 { - return Err(MeilisearchHttpError::MissingPayload(format)); - } - - if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await { - return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); - } - - let read_file = buffer.into_inner().into_std().await; - let documents_count = tokio::task::spawn_blocking(move || { - let documents_count = match format { - PayloadType::Json => read_json(&read_file, &mut update_file)?, - PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, - PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, - }; - // we NEED to persist the file here because we moved the `udpate_file` in another task. - update_file.persist()?; - Ok(documents_count) - }) - .await; - let documents_count = match documents_count { Ok(Ok(documents_count)) => documents_count, // in this case the file has not possibly be persisted. @@ -703,6 +703,39 @@ async fn document_addition( Ok(task.into()) } +async fn copy_body_to_file( + output: std::fs::File, + mut body: Payload, + format: PayloadType, +) -> Result { + let async_file = File::from_std(output); + let mut buffer = BufWriter::new(async_file); + let mut buffer_write_size: usize = 0; + while let Some(result) = body.next().await { + let byte = result?; + + if byte.is_empty() && buffer_write_size == 0 { + return Err(MeilisearchHttpError::MissingPayload(format)); + } + + match buffer.write_all(&byte).await { + Ok(()) => buffer_write_size += 1, + Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + } + } + if let Err(e) = buffer.flush().await { + return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + } + if buffer_write_size == 0 { + return Err(MeilisearchHttpError::MissingPayload(format)); + } + if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await { + return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + } + let read_file = buffer.into_inner().into_std().await; + Ok(read_file) +} + pub async fn delete_documents_batch( index_scheduler: GuardedData, Data>, index_uid: web::Path, From 69c931334fc6387b2ee92b6016762b5dac898be5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 10:08:49 +0100 Subject: [PATCH 122/158] Fix the error messages categorization with invalid NDJson --- crates/meilisearch/src/routes/indexes/documents.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 0b18810d7..264365704 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -624,13 +624,12 @@ async fn document_addition( None => None, }; - let documents_count = file - .as_ref() - .map_or(Ok(0), |ntf| read_ndjson(ntf.as_file())) 
- .map_err(|e| MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + let documents_count = file.as_ref().map_or(Ok(0), |ntf| { + read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat) + })?; let update_file = file_store::File::from_parts(path, file); update_file.persist()?; - Ok(documents_count) + Ok(Ok(documents_count)) } PayloadType::Json | PayloadType::Csv { delimiter: _ } => { let temp_file = match tempfile() { From 93fbdc06d3098694e0ce0e21ebde91c8bf92c4d3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 12:02:38 +0100 Subject: [PATCH 123/158] Use a nonrandom hasher when decoding NDJSON --- Cargo.lock | 9 +++++---- crates/meilisearch-types/Cargo.toml | 1 + crates/meilisearch-types/src/document_formats.rs | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9476506ec..349bed5db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3577,6 +3577,7 @@ dependencies = [ "memmap2", "milli", "roaring", + "rustc-hash 2.1.0", "serde", "serde-cs", "serde_json", @@ -3676,7 +3677,7 @@ dependencies = [ "rhai", "roaring", "rstar", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "serde", "serde_json", "slice-group-by", @@ -4425,7 +4426,7 @@ dependencies = [ "bytes", "rand", "ring", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls", "slab", "thiserror", @@ -4798,9 +4799,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustc_version" diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index e81e6dd35..76d8d11ca 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -26,6 +26,7 @@ memmap2 = "0.9.4" milli = { path = "../milli" } bumparaw-collections = "0.1.2" roaring = { version = "0.10.7", features = ["serde"] } +rustc-hash = "2.1.0" serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" serde_json = "1.0.120" diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 4820ac523..d858b3c17 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -8,6 +8,7 @@ use bumparaw_collections::RawMap; use memmap2::Mmap; use milli::documents::Error; use milli::Object; +use rustc_hash::FxBuildHasher; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; @@ -263,7 +264,7 @@ pub fn read_ndjson(input: &File) -> Result { match result { Ok(raw) => { // try to deserialize as a map - RawMap::from_raw_value(raw, &bump) + RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, &bump) .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; count += 1; } From 01bcc601beb72f0011568e164f459309530185f6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 12:04:29 +0100 Subject: [PATCH 124/158] Use a nonrandom hasher when decoding JSON --- crates/meilisearch-types/src/document_formats.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index d858b3c17..70a0e6204 100644 --- 
a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -221,7 +221,7 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { let mut deserializer = serde_json::Deserializer::from_slice(&input); let res = array_each(&mut deserializer, |obj: &RawValue| { doc_alloc.reset(); - let map = RawMap::from_raw_value(obj, &doc_alloc)?; + let map = RawMap::from_raw_value_and_hasher(obj, FxBuildHasher, &doc_alloc)?; to_writer(&mut out, &map) }); let count = match res { From 5622b9607d4abd6afbf32ecba3e9e25e8eaa4131 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 12:18:36 +0100 Subject: [PATCH 125/158] Wrap the read NDJSON pass into a tokio blocking --- .../src/routes/indexes/documents.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 264365704..5f79000bd 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -624,12 +624,19 @@ async fn document_addition( None => None, }; - let documents_count = file.as_ref().map_or(Ok(0), |ntf| { - read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat) - })?; - let update_file = file_store::File::from_parts(path, file); - update_file.persist()?; - Ok(Ok(documents_count)) + let documents_count = tokio::task::spawn_blocking(move || { + let documents_count = file.as_ref().map_or(Ok(0), |ntf| { + read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat) + })?; + + let update_file = file_store::File::from_parts(path, file); + update_file.persist()?; + + Ok(documents_count) + }) + .await?; + + Ok(documents_count) } PayloadType::Json | PayloadType::Csv { delimiter: _ } => { let temp_file = match tempfile() { From 5c492031d9155139191e1b175259db86f7aead06 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Wed, 11 Dec 2024 14:34:18 +0100 Subject: [PATCH 126/158] Update crates/meilitool/src/upgrade/v1_12.rs Co-authored-by: Louis Dureuil --- crates/meilitool/src/upgrade/v1_12.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/meilitool/src/upgrade/v1_12.rs b/crates/meilitool/src/upgrade/v1_12.rs index 77060d90d..85fb41472 100644 --- a/crates/meilitool/src/upgrade/v1_12.rs +++ b/crates/meilitool/src/upgrade/v1_12.rs @@ -69,6 +69,7 @@ fn convert_update_files(db_path: &Path) -> anyhow::Result<()> { "while flushing update file bufwriter for update file {update_file_path:?}" ))?; let _ = file + // atomically replace the obkv file with the rewritten NDJSON file .persist(&update_file_path) .with_context(|| format!("while persisting update file {update_file_path:?}"))?; } From 04a62d2b97e6333645e6b1ba898bb02efdb11877 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 14:57:07 +0100 Subject: [PATCH 127/158] Compile Meilisearch or run the dedicated binary file --- crates/xtask/src/bench/meili_process.rs | 11 +---------- crates/xtask/src/bench/mod.rs | 5 +++++ crates/xtask/src/bench/workload.rs | 25 ++++++++++++++++++++++--- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/crates/xtask/src/bench/meili_process.rs b/crates/xtask/src/bench/meili_process.rs index 99f6f4ea6..db787e595 100644 --- a/crates/xtask/src/bench/meili_process.rs +++ b/crates/xtask/src/bench/meili_process.rs @@ -37,17 +37,8 @@ pub async fn start( master_key: Option<&str>, workload: &Workload, asset_folder: &str, + mut command: tokio::process::Command, ) -> 
anyhow::Result { - let mut command = tokio::process::Command::new("cargo"); - command - .arg("run") - .arg("--release") - .arg("-p") - .arg("meilisearch") - .arg("--bin") - .arg("meilisearch") - .arg("--"); - command.arg("--db-path").arg("./_xtask_benchmark.ms"); if let Some(master_key) = master_key { command.arg("--master-key").arg(master_key); diff --git a/crates/xtask/src/bench/mod.rs b/crates/xtask/src/bench/mod.rs index deec120fa..491dc33ab 100644 --- a/crates/xtask/src/bench/mod.rs +++ b/crates/xtask/src/bench/mod.rs @@ -86,6 +86,10 @@ pub struct BenchDeriveArgs { /// The maximum time in seconds we allow for fetching the task queue before timing out. #[arg(long, default_value_t = 60)] tasks_queue_timeout_secs: u64, + + /// The path to the binary to run. By default it compiles the binary with cargo. + #[arg(long)] + binary_path: Option, } pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { @@ -170,6 +174,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { args.master_key.as_deref(), workload, &args, + args.binary_path.as_deref(), ) .await?; diff --git a/crates/xtask/src/bench/workload.rs b/crates/xtask/src/bench/workload.rs index 19c8bfae8..649bd0eaf 100644 --- a/crates/xtask/src/bench/workload.rs +++ b/crates/xtask/src/bench/workload.rs @@ -1,6 +1,7 @@ use std::collections::BTreeMap; use std::fs::File; use std::io::{Seek as _, Write as _}; +use std::path::Path; use anyhow::{bail, Context as _}; use futures_util::TryStreamExt as _; @@ -85,14 +86,30 @@ pub async fn execute( master_key: Option<&str>, workload: Workload, args: &BenchDeriveArgs, + binary_path: Option<&Path>, ) -> anyhow::Result<()> { assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; let workload_uuid = dashboard_client.create_workload(invocation_uuid, &workload).await?; let mut tasks = Vec::new(); - for i in 0..workload.run_count { + let run_command = match binary_path { + Some(binary_path) => tokio::process::Command::new(binary_path), + None => { + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + command + } + }; + tasks.push( execute_run( dashboard_client, @@ -102,6 +119,7 @@ pub async fn execute( master_key, &workload, args, + run_command, i, ) .await?, @@ -109,7 +127,6 @@ pub async fn execute( } let mut reports = Vec::with_capacity(workload.run_count as usize); - for task in tasks { reports.push( task.await @@ -133,13 +150,15 @@ async fn execute_run( master_key: Option<&str>, workload: &Workload, args: &BenchDeriveArgs, + run_command: tokio::process::Command, run_number: u16, ) -> anyhow::Result>> { meili_process::delete_db(); meili_process::build().await?; let meilisearch = - meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?; + meili_process::start(meili_client, master_key, workload, &args.asset_folder, run_command) + .await?; let processor = run_commands( dashboard_client, From bfca54cc2cd5cc65d54dfeb7aa9b58103d0464b7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 11 Dec 2024 15:26:18 +0100 Subject: [PATCH 128/158] Return docid in case of errors while rendering the document template --- crates/milli/src/prompt/error.rs | 12 ++++++++++++ crates/milli/src/prompt/mod.rs | 10 +++++++--- crates/milli/src/update/new/extract/vectors/mod.rs | 6 ++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/prompt/error.rs b/crates/milli/src/prompt/error.rs index 
8a762b60a..a92e2fdc3 100644 --- a/crates/milli/src/prompt/error.rs +++ b/crates/milli/src/prompt/error.rs @@ -38,6 +38,16 @@ pub struct RenderPromptError { pub fault: FaultSource, } impl RenderPromptError { + pub(crate) fn missing_context_with_external_docid( + external_docid: String, + inner: liquid::Error, + ) -> RenderPromptError { + Self { + kind: RenderPromptErrorKind::MissingContextWithExternalDocid(external_docid, inner), + fault: FaultSource::User, + } + } + pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError { Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User } } @@ -47,6 +57,8 @@ impl RenderPromptError { pub enum RenderPromptErrorKind { #[error("missing field in document: {0}")] MissingContext(liquid::Error), + #[error("missing field in document `{0}`: {1}")] + MissingContextWithExternalDocid(String, liquid::Error), } impl From for crate::Error { diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index bbcf054e6..3eb91611e 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -119,6 +119,7 @@ impl Prompt { 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents >( &self, + external_docid: &str, document: impl crate::update::new::document::Document<'a> + Debug, field_id_map: &RefCell, doc_alloc: &'doc Bump, @@ -130,9 +131,12 @@ impl Prompt { self.max_bytes.unwrap_or_else(default_max_bytes).get(), doc_alloc, ); - self.template - .render_to(&mut rendered, &context) - .map_err(RenderPromptError::missing_context)?; + self.template.render_to(&mut rendered, &context).map_err(|liquid_error| { + RenderPromptError::missing_context_with_external_docid( + external_docid.to_owned(), + liquid_error, + ) + })?; Ok(std::str::from_utf8(rendered.into_bump_slice()) .expect("render can only write UTF-8 because all inputs and processing preserve utf-8")) } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 1110432fa..2a72a1650 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -130,6 +130,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { ); } else if new_vectors.regenerate { let new_rendered = prompt.render_document( + update.external_document_id(), update.current( &context.rtxn, context.index, @@ -139,6 +140,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.doc_alloc, )?; let old_rendered = prompt.render_document( + update.external_document_id(), update.merged( &context.rtxn, context.index, @@ -158,6 +160,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { } } else if old_vectors.regenerate { let old_rendered = prompt.render_document( + update.external_document_id(), update.current( &context.rtxn, context.index, @@ -167,6 +170,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.doc_alloc, )?; let new_rendered = prompt.render_document( + update.external_document_id(), update.merged( &context.rtxn, context.index, @@ -216,6 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { ); } else if new_vectors.regenerate { let rendered = prompt.render_document( + insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, @@ -229,6 +234,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for 
EmbeddingExtractor<'a, 'b> { } } else { let rendered = prompt.render_document( + insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, From eaa897d983d2c71b6f76a453f1739980ba980558 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 15:57:16 +0100 Subject: [PATCH 129/158] Avoid compiling when unecessary --- crates/xtask/src/bench/workload.rs | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/crates/xtask/src/bench/workload.rs b/crates/xtask/src/bench/workload.rs index 649bd0eaf..39119428f 100644 --- a/crates/xtask/src/bench/workload.rs +++ b/crates/xtask/src/bench/workload.rs @@ -94,22 +94,6 @@ pub async fn execute( let mut tasks = Vec::new(); for i in 0..workload.run_count { - let run_command = match binary_path { - Some(binary_path) => tokio::process::Command::new(binary_path), - None => { - let mut command = tokio::process::Command::new("cargo"); - command - .arg("run") - .arg("--release") - .arg("-p") - .arg("meilisearch") - .arg("--bin") - .arg("meilisearch") - .arg("--"); - command - } - }; - tasks.push( execute_run( dashboard_client, @@ -119,7 +103,7 @@ pub async fn execute( master_key, &workload, args, - run_command, + binary_path, i, ) .await?, @@ -150,12 +134,28 @@ async fn execute_run( master_key: Option<&str>, workload: &Workload, args: &BenchDeriveArgs, - run_command: tokio::process::Command, + binary_path: Option<&Path>, run_number: u16, ) -> anyhow::Result>> { meili_process::delete_db(); - meili_process::build().await?; + let run_command = match binary_path { + Some(binary_path) => tokio::process::Command::new(binary_path), + None => { + meili_process::build().await?; + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + command + } + }; + let meilisearch = meili_process::start(meili_client, master_key, workload, &args.asset_folder, run_command) .await?; From df9b68f8ed965f6d37a3c186ba7e4255a5640dfe Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 10 Dec 2024 16:30:48 +0100 Subject: [PATCH 130/158] inital implementation of the progress --- crates/benchmarks/benches/indexing.rs | 123 +++++------ crates/benchmarks/benches/utils.rs | 5 +- crates/fuzzers/src/bin/fuzz-indexing.rs | 5 +- crates/index-scheduler/src/batch.rs | 63 ++---- crates/index-scheduler/src/insta_snapshot.rs | 2 +- crates/index-scheduler/src/lib.rs | 74 ++----- crates/index-scheduler/src/processing.rs | 205 ++++++++++++++++++ crates/index-scheduler/src/utils.rs | 7 +- crates/meilisearch-types/src/batch_view.rs | 3 + crates/meilisearch-types/src/batches.rs | 3 + crates/meilisearch-types/src/tasks.rs | 57 ----- crates/milli/src/index.rs | 13 +- crates/milli/src/lib.rs | 1 + crates/milli/src/progress.rs | 116 ++++++++++ .../milli/src/search/new/tests/integration.rs | 5 +- .../milli/src/update/index_documents/mod.rs | 47 ++-- .../new/extract/faceted/extract_facets.rs | 20 +- crates/milli/src/update/new/extract/mod.rs | 13 +- .../extract/searchable/extract_word_docids.rs | 20 +- .../src/update/new/extract/searchable/mod.rs | 18 +- .../update/new/indexer/document_changes.rs | 53 +---- .../update/new/indexer/document_deletion.rs | 7 +- .../update/new/indexer/document_operation.rs | 29 ++- crates/milli/src/update/new/indexer/mod.rs | 43 ++-- crates/milli/src/update/new/steps.rs | 47 ++-- .../milli/tests/search/facet_distribution.rs | 5 +- crates/milli/tests/search/mod.rs | 5 
+- crates/milli/tests/search/query_criteria.rs | 5 +- crates/milli/tests/search/typo_tolerance.rs | 5 +- 29 files changed, 585 insertions(+), 414 deletions(-) create mode 100644 crates/index-scheduler/src/processing.rs create mode 100644 crates/milli/src/progress.rs diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 870e56686..4acd7b22a 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -8,6 +8,7 @@ use bumpalo::Bump; use criterion::{criterion_group, criterion_main, Criterion}; use milli::documents::PrimaryKey; use milli::heed::{EnvOpenOptions, RwTxn}; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -151,7 +152,7 @@ fn indexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -166,7 +167,7 @@ fn indexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -218,7 +219,7 @@ fn reindexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -233,7 +234,7 @@ fn reindexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -263,7 +264,7 @@ fn reindexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -278,7 +279,7 @@ fn reindexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -332,7 +333,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -347,7 +348,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -409,7 +410,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -424,7 +425,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -454,7 +455,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -469,7 +470,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -495,7 +496,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -510,7 +511,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -563,7 +564,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -578,7 +579,7 @@ fn 
indexing_songs_without_faceted_numbers(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -630,7 +631,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -645,7 +646,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -697,7 +698,7 @@ fn indexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -712,7 +713,7 @@ fn indexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -763,7 +764,7 @@ fn reindexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -778,7 +779,7 @@ fn reindexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -808,7 +809,7 @@ fn reindexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -823,7 +824,7 @@ fn reindexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -876,7 +877,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -891,7 +892,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -953,7 +954,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -968,7 +969,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -999,7 +1000,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1014,7 +1015,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1041,7 +1042,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1056,7 +1057,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1108,7 +1109,7 @@ fn indexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1123,7 +1124,7 @@ fn indexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1174,7 +1175,7 @@ fn reindexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1189,7 +1190,7 @@ fn reindexing_movies_default(c: &mut Criterion) { &document_changes, 
EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1219,7 +1220,7 @@ fn reindexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1234,7 +1235,7 @@ fn reindexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1287,7 +1288,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1302,7 +1303,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1350,7 +1351,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -125,7 +126,7 @@ pub fn base_setup(conf: &Conf) -> Index { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index ee927940f..08711e5e3 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -10,6 +10,7 @@ use either::Either; use fuzzers::Operation; use milli::documents::mmap_from_objects; use milli::heed::EnvOpenOptions; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig}; use milli::vector::EmbeddingConfigs; @@ -128,7 +129,7 @@ fn main() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -143,7 +144,7 @@ fn main() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 93e9a1404..1bfa7f53b 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -22,8 +22,6 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; -use std::sync::atomic::{self, AtomicU64}; -use std::time::Duration; use bumpalo::collections::CollectIn; use bumpalo::Bump; @@ -32,6 +30,7 @@ use meilisearch_types::batches::BatchId; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; +use meilisearch_types::milli::progress::Progress; use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; use meilisearch_types::milli::update::{ DocumentAdditionResult, IndexDocumentsMethod, Settings as MilliSettings, @@ -41,9 +40,7 @@ use meilisearch_types::milli::vector::parsed_vectors::{ }; use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; -use meilisearch_types::tasks::{ - Details, IndexSwap, Kind, KindWithContent, Status, Task, TaskProgress, -}; +use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; use roaring::RoaringBitmap; use time::macros::format_description; @@ -561,11 +558,12 @@ impl IndexScheduler { /// The list of tasks that were processed. 
The metadata of each task in the returned /// list is updated accordingly, with the exception of the its date fields /// [`finished_at`](meilisearch_types::tasks::Task::finished_at) and [`started_at`](meilisearch_types::tasks::Task::started_at). - #[tracing::instrument(level = "trace", skip(self, batch), target = "indexing::scheduler", fields(batch=batch.to_string()))] + #[tracing::instrument(level = "trace", skip(self, batch, progress), target = "indexing::scheduler", fields(batch=batch.to_string()))] pub(crate) fn process_batch( &self, batch: Batch, current_batch: &mut ProcessingBatch, + progress: Progress, ) -> Result> { #[cfg(test)] { @@ -953,7 +951,7 @@ impl IndexScheduler { .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let mut index_wtxn = index.write_txn()?; - let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?; + let tasks = self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; { let span = tracing::trace_span!(target: "indexing::scheduler", "commit"); @@ -996,6 +994,7 @@ impl IndexScheduler { self.process_batch( Batch::IndexUpdate { index_uid, primary_key, task }, current_batch, + progress, ) } Batch::IndexUpdate { index_uid, primary_key, mut task } => { @@ -1168,7 +1167,7 @@ impl IndexScheduler { /// The list of processed tasks. #[tracing::instrument( level = "trace", - skip(self, index_wtxn, index), + skip(self, index_wtxn, index, progress), target = "indexing::scheduler" )] fn apply_index_operation<'i>( @@ -1176,44 +1175,12 @@ impl IndexScheduler { index_wtxn: &mut RwTxn<'i>, index: &'i Index, operation: IndexOperation, + progress: Progress, ) -> Result> { let indexer_alloc = Bump::new(); let started_processing_at = std::time::Instant::now(); - let secs_since_started_processing_at = AtomicU64::new(0); - const PRINT_SECS_DELTA: u64 = 5; - - let processing_tasks = self.processing_tasks.clone(); let must_stop_processing = self.must_stop_processing.clone(); - let send_progress = |progress| { - let now = std::time::Instant::now(); - let elapsed = secs_since_started_processing_at.load(atomic::Ordering::Relaxed); - let previous = started_processing_at + Duration::from_secs(elapsed); - let elapsed = now - previous; - - if elapsed.as_secs() < PRINT_SECS_DELTA { - return; - } - - secs_since_started_processing_at - .store((now - started_processing_at).as_secs(), atomic::Ordering::Relaxed); - - let TaskProgress { - current_step, - finished_steps, - total_steps, - finished_substeps, - total_substeps, - } = processing_tasks.write().unwrap().update_progress(progress); - - tracing::info!( - current_step, - finished_steps, - total_steps, - finished_substeps, - total_substeps - ); - }; match operation { IndexOperation::DocumentClear { index_uid, mut tasks } => { @@ -1308,7 +1275,7 @@ impl IndexScheduler { primary_key.as_deref(), &mut new_fields_ids_map, &|| must_stop_processing.get(), - &send_progress, + progress.clone(), ) .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; @@ -1356,7 +1323,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &send_progress, + &progress, ) .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; @@ -1470,7 +1437,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &send_progress, + &progress, ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; @@ -1621,7 +1588,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &send_progress, + &progress, ) .map_err(|err| 
Error::from_milli(err, Some(index_uid.clone())))?; @@ -1673,12 +1640,14 @@ impl IndexScheduler { index_uid: index_uid.clone(), tasks: cleared_tasks, }, + progress.clone(), )?; let settings_tasks = self.apply_index_operation( index_wtxn, index, IndexOperation::Settings { index_uid, settings, tasks: settings_tasks }, + progress, )?; let mut tasks = settings_tasks; @@ -1702,8 +1671,8 @@ impl IndexScheduler { let all_task_ids = self.all_task_ids(wtxn)?; let mut to_delete_tasks = all_task_ids & matched_tasks; - to_delete_tasks -= processing_tasks; - to_delete_tasks -= enqueued_tasks; + to_delete_tasks -= &**processing_tasks; + to_delete_tasks -= &enqueued_tasks; // 2. We now have a list of tasks to delete, delete them diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bcd5966b5..67627d8c1 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -353,7 +353,7 @@ pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database String { let mut snap = String::new(); - let Batch { uid, details, stats, started_at, finished_at } = batch; + let Batch { uid, details, stats, started_at, finished_at, progress: _ } = batch; if let Some(finished_at) = finished_at { assert!(finished_at > started_at); } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index e780b21a1..f5f73087d 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -26,6 +26,7 @@ mod index_mapper; #[cfg(test)] mod insta_snapshot; mod lru; +mod processing; mod utils; pub mod uuid_codec; @@ -56,12 +57,12 @@ use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::index::IndexEmbeddingConfig; -use meilisearch_types::milli::update::new::indexer::document_changes::Progress; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; use meilisearch_types::task_view::TaskView; -use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task, TaskProgress}; +use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; +use processing::ProcessingTasks; use rayon::current_num_threads; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -72,7 +73,8 @@ use utils::{filter_out_references_to_newer_tasks, keep_ids_within_datetimes, map use uuid::Uuid; use crate::index_mapper::IndexMapper; -use crate::utils::{check_index_swap_validity, clamp_to_page_size, ProcessingBatch}; +use crate::processing::{AtomicTaskStep, BatchProgress}; +use crate::utils::{check_index_swap_validity, clamp_to_page_size}; pub(crate) type BEI128 = I128; @@ -163,48 +165,6 @@ impl Query { } } -#[derive(Debug, Clone)] -pub struct ProcessingTasks { - batch: Option, - /// The list of tasks ids that are currently running. - processing: RoaringBitmap, - /// The progress on processing tasks - progress: Option, -} - -impl ProcessingTasks { - /// Creates an empty `ProcessingAt` struct. - fn new() -> ProcessingTasks { - ProcessingTasks { batch: None, processing: RoaringBitmap::new(), progress: None } - } - - /// Stores the currently processing tasks, and the date time at which it started. 
- fn start_processing(&mut self, processing_batch: ProcessingBatch, processing: RoaringBitmap) { - self.batch = Some(processing_batch); - self.processing = processing; - } - - fn update_progress(&mut self, progress: Progress) -> TaskProgress { - self.progress.get_or_insert_with(TaskProgress::default).update(progress) - } - - /// Set the processing tasks to an empty list - fn stop_processing(&mut self) -> Self { - self.progress = None; - - Self { - batch: std::mem::take(&mut self.batch), - processing: std::mem::take(&mut self.processing), - progress: None, - } - } - - /// Returns `true` if there, at least, is one task that is currently processing that we must stop. - fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool { - !self.processing.is_disjoint(canceled_tasks) - } -} - #[derive(Default, Clone, Debug)] struct MustStopProcessing(Arc); @@ -813,7 +773,7 @@ impl IndexScheduler { let mut batch_tasks = RoaringBitmap::new(); for batch_uid in batch_uids { if processing_batch.as_ref().map_or(false, |batch| batch.uid == *batch_uid) { - batch_tasks |= &processing_tasks; + batch_tasks |= &*processing_tasks; } else { batch_tasks |= self.tasks_in_batch(rtxn, *batch_uid)?; } @@ -827,13 +787,13 @@ impl IndexScheduler { match status { // special case for Processing tasks Status::Processing => { - status_tasks |= &processing_tasks; + status_tasks |= &*processing_tasks; } status => status_tasks |= &self.get_status(rtxn, *status)?, }; } if !status.contains(&Status::Processing) { - tasks -= &processing_tasks; + tasks -= &*processing_tasks; } tasks &= status_tasks; } @@ -882,7 +842,7 @@ impl IndexScheduler { // Once we have filtered the two subsets, we put them back together and assign it back to `tasks`. tasks = { let (mut filtered_non_processing_tasks, mut filtered_processing_tasks) = - (&tasks - &processing_tasks, &tasks & &processing_tasks); + (&tasks - &*processing_tasks, &tasks & &*processing_tasks); // special case for Processing tasks // A closure that clears the filtered_processing_tasks if their started_at date falls outside the given bounds @@ -1090,7 +1050,7 @@ impl IndexScheduler { // Once we have filtered the two subsets, we put them back together and assign it back to `batches`. 
batches = { let (mut filtered_non_processing_batches, mut filtered_processing_batches) = - (&batches - &processing.processing, &batches & &processing.processing); + (&batches - &*processing.processing, &batches & &*processing.processing); // special case for Processing batches // A closure that clears the filtered_processing_batches if their started_at date falls outside the given bounds @@ -1606,7 +1566,8 @@ impl IndexScheduler { // We reset the must_stop flag to be sure that we don't stop processing tasks self.must_stop_processing.reset(); - self.processing_tasks + let progress = self + .processing_tasks .write() .unwrap() // We can clone the processing batch here because we don't want its modification to affect the view of the processing batches @@ -1619,11 +1580,12 @@ impl IndexScheduler { let res = { let cloned_index_scheduler = self.private_clone(); let processing_batch = &mut processing_batch; + let progress = progress.clone(); std::thread::scope(|s| { let handle = std::thread::Builder::new() .name(String::from("batch-operation")) .spawn_scoped(s, move || { - cloned_index_scheduler.process_batch(batch, processing_batch) + cloned_index_scheduler.process_batch(batch, processing_batch, progress) }) .unwrap(); handle.join().unwrap_or(Err(Error::ProcessBatchPanicked)) @@ -1636,6 +1598,7 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?; + progress.update_progress(BatchProgress::WritingTasksToDisk); processing_batch.finished(); let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; let mut canceled = RoaringBitmap::new(); @@ -1645,12 +1608,15 @@ impl IndexScheduler { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchSucceeded); + let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32); + progress.update_progress(task_progress_obj); let mut success = 0; let mut failure = 0; let mut canceled_by = None; #[allow(unused_variables)] for (i, mut task) in tasks.into_iter().enumerate() { + task_progress.fetch_add(1, Ordering::Relaxed); processing_batch.update(&mut task); if task.status == Status::Canceled { canceled.insert(task.uid); @@ -1718,8 +1684,12 @@ impl IndexScheduler { Err(err) => { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchFailed); + let (task_progress, task_progress_obj) = AtomicTaskStep::new(ids.len() as u32); + progress.update_progress(task_progress_obj); + let error: ResponseError = err.into(); for id in ids.iter() { + task_progress.fetch_add(1, Ordering::Relaxed); let mut task = self .get_task(&wtxn, id) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs new file mode 100644 index 000000000..e5e892927 --- /dev/null +++ b/crates/index-scheduler/src/processing.rs @@ -0,0 +1,205 @@ +use crate::utils::ProcessingBatch; +use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}; +use roaring::RoaringBitmap; +use std::{borrow::Cow, sync::Arc}; + +#[derive(Clone)] +pub struct ProcessingTasks { + pub batch: Option>, + /// The list of tasks ids that are currently running. + pub processing: Arc, + /// The progress on processing tasks + pub progress: Option, +} + +impl ProcessingTasks { + /// Creates an empty `ProcessingAt` struct. 
+ pub fn new() -> ProcessingTasks { + ProcessingTasks { batch: None, processing: Arc::new(RoaringBitmap::new()), progress: None } + } + + pub fn get_progress_view(&self) -> Option { + Some(self.progress.as_ref()?.as_progress_view()) + } + + /// Stores the currently processing tasks, and the date time at which it started. + pub fn start_processing( + &mut self, + processing_batch: ProcessingBatch, + processing: RoaringBitmap, + ) -> Progress { + self.batch = Some(Arc::new(processing_batch)); + self.processing = Arc::new(processing); + let progress = Progress::default(); + progress.update_progress(BatchProgress::ProcessingTasks); + self.progress = Some(progress.clone()); + + progress + } + + /// Set the processing tasks to an empty list + pub fn stop_processing(&mut self) -> Self { + self.progress = None; + + Self { + batch: std::mem::take(&mut self.batch), + processing: std::mem::take(&mut self.processing), + progress: None, + } + } + + /// Returns `true` if there, at least, is one task that is currently processing that we must stop. + pub fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool { + !self.processing.is_disjoint(canceled_tasks) + } +} + +#[repr(u8)] +#[derive(Copy, Clone)] +pub enum BatchProgress { + ProcessingTasks, + WritingTasksToDisk, +} + +impl Step for BatchProgress { + fn name(&self) -> Cow<'static, str> { + match self { + BatchProgress::ProcessingTasks => Cow::Borrowed("processing tasks"), + BatchProgress::WritingTasksToDisk => Cow::Borrowed("writing tasks to disk"), + } + } + + fn current(&self) -> u32 { + *self as u8 as u32 + } + + fn total(&self) -> u32 { + 2 + } +} + +#[derive(Default)] +pub struct Task {} + +impl NamedStep for Task { + fn name(&self) -> &'static str { + "task" + } +} +pub type AtomicTaskStep = AtomicSubStep; + +#[cfg(test)] +mod test { + use std::sync::atomic::Ordering; + + use meili_snap::{json_string, snapshot}; + + use super::*; + + #[test] + fn one_level() { + let mut processing = ProcessingTasks::new(); + processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new()); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "name": "processing tasks", + "finished": 0, + "total": 2 + } + ], + "percentage": 0.0 + } + "#); + processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "name": "writing tasks to disk", + "finished": 1, + "total": 2 + } + ], + "percentage": 50.0 + } + "#); + } + + #[test] + fn task_progress() { + let mut processing = ProcessingTasks::new(); + processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new()); + let (atomic, tasks) = AtomicTaskStep::new(10); + processing.progress.as_ref().unwrap().update_progress(tasks); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "name": "processing tasks", + "finished": 0, + "total": 2 + }, + { + "name": "task", + "finished": 0, + "total": 10 + } + ], + "percentage": 0.0 + } + "#); + atomic.fetch_add(6, Ordering::Relaxed); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "name": "processing tasks", + "finished": 0, + "total": 2 + }, + { + "name": "task", + "finished": 6, + "total": 10 + } + ], + "percentage": 30.000002 + } + "#); + processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "name": 
"writing tasks to disk", + "finished": 1, + "total": 2 + } + ], + "percentage": 50.0 + } + "#); + let (atomic, tasks) = AtomicTaskStep::new(5); + processing.progress.as_ref().unwrap().update_progress(tasks); + atomic.fetch_add(4, Ordering::Relaxed); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "name": "writing tasks to disk", + "finished": 1, + "total": 2 + }, + { + "name": "task", + "finished": 4, + "total": 5 + } + ], + "percentage": 90.0 + } + "#); + } +} diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 356d77b35..3718c69ca 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -134,6 +134,7 @@ impl ProcessingBatch { pub fn to_batch(&self) -> Batch { Batch { uid: self.uid, + progress: None, details: self.details.clone(), stats: self.stats.clone(), started_at: self.started_at, @@ -187,6 +188,7 @@ impl IndexScheduler { &batch.uid, &Batch { uid: batch.uid, + progress: None, details: batch.details, stats: batch.stats, started_at: batch.started_at, @@ -273,7 +275,10 @@ impl IndexScheduler { .into_iter() .map(|batch_id| { if Some(batch_id) == processing.batch.as_ref().map(|batch| batch.uid) { - Ok(processing.batch.as_ref().unwrap().to_batch()) + let mut batch = processing.batch.as_ref().unwrap().to_batch(); + println!("here with progress: {}", processing.progress.is_some()); + batch.progress = processing.get_progress_view(); + Ok(batch) } else { self.get_batch(rtxn, batch_id) .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index 5d800d897..a3d7f834f 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -1,3 +1,4 @@ +use milli::progress::ProgressView; use serde::Serialize; use time::{Duration, OffsetDateTime}; @@ -11,6 +12,7 @@ use crate::{ #[serde(rename_all = "camelCase")] pub struct BatchView { pub uid: BatchId, + pub progress: Option, pub details: DetailsView, pub stats: BatchStats, #[serde(serialize_with = "serialize_duration", default)] @@ -25,6 +27,7 @@ impl BatchView { pub fn from_batch(batch: &Batch) -> Self { Self { uid: batch.uid, + progress: batch.progress.clone(), details: batch.details.clone(), stats: batch.stats.clone(), duration: batch.finished_at.map(|finished_at| finished_at - batch.started_at), diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index a60386e52..57c609320 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -1,5 +1,6 @@ use std::collections::BTreeMap; +use milli::progress::ProgressView; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; @@ -15,6 +16,8 @@ pub type BatchId = u32; pub struct Batch { pub uid: BatchId, + #[serde(skip_deserializing)] + pub progress: Option, pub details: DetailsView, pub stats: BatchStats, diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index ebd28f526..c62f550ae 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -4,7 +4,6 @@ use std::fmt::{Display, Write}; use std::str::FromStr; use enum_iterator::Sequence; -use milli::update::new::indexer::document_changes::Progress; use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; @@ -41,62 +40,6 @@ pub struct Task { pub kind: KindWithContent, } -#[derive(Clone, Copy, Debug, PartialEq, 
Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct TaskProgress { - pub current_step: &'static str, - pub finished_steps: u16, - pub total_steps: u16, - pub finished_substeps: Option, - pub total_substeps: Option, -} - -impl Default for TaskProgress { - fn default() -> Self { - Self::new() - } -} - -impl TaskProgress { - pub fn new() -> Self { - Self { - current_step: "start", - finished_steps: 0, - total_steps: 1, - finished_substeps: None, - total_substeps: None, - } - } - - pub fn update(&mut self, progress: Progress) -> TaskProgress { - if self.finished_steps > progress.finished_steps { - return *self; - } - - if self.current_step != progress.step_name { - self.current_step = progress.step_name - } - - self.total_steps = progress.total_steps; - - if self.finished_steps < progress.finished_steps { - self.finished_substeps = None; - self.total_substeps = None; - } - self.finished_steps = progress.finished_steps; - if let Some((finished_substeps, total_substeps)) = progress.finished_total_substep { - if let Some(task_finished_substeps) = self.finished_substeps { - if task_finished_substeps > finished_substeps { - return *self; - } - } - self.finished_substeps = Some(finished_substeps); - self.total_substeps = Some(total_substeps); - } - *self - } -} - impl Task { pub fn index_uid(&self) -> Option<&str> { use KindWithContent::*; diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 268d33cd9..f60b59c72 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1734,6 +1734,7 @@ pub(crate) mod tests { use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; + use crate::progress::Progress; use crate::update::new::indexer; use crate::update::settings::InnerIndexSettings; use crate::update::{ @@ -1810,7 +1811,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), )?; if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { @@ -1829,7 +1830,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) }) .unwrap()?; @@ -1901,7 +1902,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), )?; if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { @@ -1920,7 +1921,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) }) .unwrap()?; @@ -1982,7 +1983,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2001,7 +2002,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| should_abort.load(Relaxed), - &|_| (), + &Progress::default(), ) }) .unwrap() diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 1fc876f79..3ae0bfdb9 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -31,6 +31,7 @@ pub mod vector; #[macro_use] pub mod snapshot_tests; mod fieldids_weights_map; +pub mod progress; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs new file mode 100644 index 000000000..63f0fbef8 --- /dev/null +++ b/crates/milli/src/progress.rs @@ -0,0 +1,116 @@ +use std::{ + any::TypeId, + borrow::Cow, + sync::{ + atomic::{AtomicU32, Ordering}, + Arc, RwLock, + }, +}; + +use serde::Serialize; + +pub trait Step: 'static + 
Send + Sync { + fn name(&self) -> Cow<'static, str>; + fn current(&self) -> u32; + fn total(&self) -> u32; +} + +#[derive(Clone, Default)] +pub struct Progress { + steps: Arc<RwLock<Vec<(TypeId, Box<dyn Step>)>>>, +} + +impl Progress { + pub fn update_progress<P: Step>(&self, sub_progress: P) { + let mut steps = self.steps.write().unwrap(); + let step_type = TypeId::of::<P>
(); + if let Some(idx) = steps.iter().position(|(id, _)| *id == step_type) { + steps.truncate(idx); + } + steps.push((step_type, Box::new(sub_progress))); + } + + // TODO: This code should be in meilisearch_types but cannot because milli can't depend on meilisearch_types + pub fn as_progress_view(&self) -> ProgressView { + let steps = self.steps.read().unwrap(); + + let mut percentage = 0.0; + let mut prev_factors = 1.0; + + let mut step_view = Vec::new(); + for (_, step) in steps.iter() { + prev_factors *= step.total() as f32; + percentage += step.current() as f32 / prev_factors; + + step_view.push(ProgressStepView { + name: step.name(), + finished: step.current(), + total: step.total(), + }); + } + + ProgressView { steps: step_view, percentage: percentage * 100.0 } + } +} + +/// This trait lets you use the AtomicSubStep defined right below. +/// The name must be a const that never changed but that can't be enforced by the type system because it make the trait non object-safe. +/// By forcing the Default trait + the &'static str we make it harder to miss-use the trait. +pub trait NamedStep: 'static + Send + Sync + Default { + fn name(&self) -> &'static str; +} + +/// Structure to quickly define steps that need very quick, lockless updating of their current step. +/// You can use this struct if: +/// - The name of the step doesn't change +/// - The total number of steps doesn't change +pub struct AtomicSubStep { + name: Name, + current: Arc, + total: u32, +} + +impl AtomicSubStep { + pub fn new(total: u32) -> (Arc, Self) { + let current = Arc::new(AtomicU32::new(0)); + (current.clone(), Self { current, total, name: Name::default() }) + } +} + +impl Step for AtomicSubStep { + fn name(&self) -> Cow<'static, str> { + self.name.name().into() + } + + fn current(&self) -> u32 { + self.current.load(Ordering::Relaxed) + } + + fn total(&self) -> u32 { + self.total + } +} + +#[derive(Default)] +pub struct Document {} + +impl NamedStep for Document { + fn name(&self) -> &'static str { + "document" + } +} + +pub type AtomicDocumentStep = AtomicSubStep; + +#[derive(Debug, Serialize, Clone)] +pub struct ProgressView { + steps: Vec, + percentage: f32, +} + +#[derive(Debug, Serialize, Clone)] +pub struct ProgressStepView { + name: Cow<'static, str>, + finished: u32, + total: u32, +} diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 5db5b400b..04d3b6667 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -5,6 +5,7 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; +use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use crate::vector::EmbeddingConfigs; @@ -72,7 +73,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -91,7 +92,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 3988b311c..bae8e00b4 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -766,6 +766,7 @@ mod tests { use crate::documents::mmap_from_objects; use 
crate::index::tests::TempIndex; use crate::index::IndexEmbeddingConfig; + use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; @@ -1964,7 +1965,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2148,7 +2149,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2163,7 +2164,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2210,7 +2211,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2225,7 +2226,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2263,7 +2264,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2278,7 +2279,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2315,7 +2316,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2330,7 +2331,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2369,7 +2370,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2384,7 +2385,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2428,7 +2429,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2443,7 +2444,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2480,7 +2481,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2495,7 +2496,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2532,7 +2533,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2547,7 +2548,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2726,7 +2727,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2741,7 +2742,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2785,7 +2786,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2800,7 +2801,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2841,7 +2842,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2856,7 +2857,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 
b865d0a35..66ed6cbfb 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -16,10 +16,10 @@ use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; use crate::update::new::extract::perm_json_p; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -373,26 +373,16 @@ fn truncate_str(s: &str) -> &str { impl FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - pub fn run_extraction< - 'pl, - 'fid, - 'indexer, - 'index, - 'extractor, - DC: DocumentChanges<'pl>, - MSP, - SP, - >( + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, sender: &FieldIdDocidFacetSender, - step: Step, + step: IndexingStep, ) -> Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let index = indexing_context.index; let rtxn = index.read_txn()?; diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 0bdf31635..4bcb918e4 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -15,23 +15,22 @@ pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress}; -use super::steps::Step; +use super::indexer::document_changes::{DocumentChanges, IndexingContext}; +use super::steps::IndexingStep; use super::thread_local::{FullySend, ThreadLocal}; use crate::update::GrenadParameters; use crate::Result; pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result>> where - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync; + MSP: Fn() -> bool + Sync; } /// TODO move in permissive json pointer diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 5e85eb1c8..952ee91e4 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use 
crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -239,25 +239,15 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction< - 'pl, - 'fid, - 'indexer, - 'index, - 'extractor, - DC: DocumentChanges<'pl>, - MSP, - SP, - >( + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let index = indexing_context.index; let rtxn = index.read_txn()?; diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index 05d2406d9..c4240196a 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::BalancedCaches; use super::DocidsExtractor; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -56,16 +56,15 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> } pub trait SearchableExtractor: Sized + Sync { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let rtxn = indexing_context.index.read_txn()?; let stop_words = indexing_context.index.stop_words(&rtxn)?; @@ -134,16 +133,15 @@ pub trait SearchableExtractor: Sized + Sync { } impl DocidsExtractor for T { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> 
Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { Self::run_extraction( grenad_parameters, diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index 2a5c25525..f2edfb1f3 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -1,4 +1,5 @@ use std::cell::{Cell, RefCell}; +use std::sync::atomic::Ordering; use std::sync::{Arc, RwLock}; use bumpalo::Bump; @@ -7,8 +8,9 @@ use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; +use crate::progress::{AtomicDocumentStep, Progress}; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; @@ -133,10 +135,8 @@ pub struct IndexingContext< 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { pub index: &'index Index, pub db_fields_ids_map: &'indexer FieldsIdsMap, @@ -144,7 +144,8 @@ pub struct IndexingContext< pub doc_allocs: &'indexer ThreadLocal>>, pub fields_ids_map_store: &'indexer ThreadLocal>>>, pub must_stop_processing: &'indexer MSP, - pub send_progress: &'indexer SP, + // TODO: TAMO: Rename field to progress + pub send_progress: &'indexer Progress, } impl< @@ -152,18 +153,15 @@ impl< 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > Copy for IndexingContext< 'fid, // invariant lifetime of fields ids map 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { } @@ -172,18 +170,15 @@ impl< 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > Clone for IndexingContext< 'fid, // invariant lifetime of fields ids map 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { fn clone(&self) -> Self { *self @@ -202,7 +197,6 @@ pub fn extract< EX, DC: DocumentChanges<'pl>, MSP, - SP, >( document_changes: &DC, extractor: &EX, @@ -214,17 +208,17 @@ pub fn extract< fields_ids_map_store, must_stop_processing, send_progress, - }: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + }: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, datastore: &'data ThreadLocal, - step: Step, + step: IndexingStep, ) -> Result<()> where EX: Extractor<'extractor>, MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { tracing::trace!("We are resetting the extractor allocators"); + send_progress.update_progress(step); // Clean up and reuse the extractor allocs for extractor_alloc in extractor_allocs.iter_mut() { tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes()); @@ -232,6 +226,8 @@ where } let total_documents = 
document_changes.len() as u32; + let (step, progress_step) = AtomicDocumentStep::new(total_documents); + send_progress.update_progress(progress_step); let pi = document_changes.iter(CHUNK_SIZE); pi.enumerate().try_arc_for_each_try_init( @@ -253,7 +249,7 @@ where } let finished_documents = (finished_documents * CHUNK_SIZE) as u32; - (send_progress)(Progress::from_step_substep(step, finished_documents, total_documents)); + step.store(finished_documents, Ordering::Relaxed); // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); @@ -271,32 +267,7 @@ where res }, )?; - - (send_progress)(Progress::from_step_substep(step, total_documents, total_documents)); + step.store(total_documents, Ordering::Relaxed); Ok(()) } - -pub struct Progress { - pub finished_steps: u16, - pub total_steps: u16, - pub step_name: &'static str, - pub finished_total_substep: Option<(u32, u32)>, -} - -impl Progress { - pub fn from_step(step: Step) -> Self { - Self { - finished_steps: step.finished_steps(), - total_steps: Step::total_steps(), - step_name: step.name(), - finished_total_substep: None, - } - } - pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self { - Self { - finished_total_substep: Some((finished_substep, total_substep)), - ..Progress::from_step(step) - } - } -} diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 518786e6f..33e69e49c 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -92,11 +92,12 @@ mod test { use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; + use crate::progress::Progress; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, Extractor, IndexingContext, }; use crate::update::new::indexer::DocumentDeletion; - use crate::update::new::steps::Step; + use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::DocumentId; @@ -164,7 +165,7 @@ mod test { doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing: &(|| false), - send_progress: &(|_progress| {}), + send_progress: &Progress::default(), }; for _ in 0..3 { @@ -176,7 +177,7 @@ mod test { context, &mut extractor_allocs, &datastore, - Step::ExtractingDocuments, + IndexingStep::ExtractingDocuments, ) .unwrap(); diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 0b7ec493e..0ce53d5d2 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -1,3 +1,5 @@ +use std::sync::atomic::Ordering; + use bumpalo::collections::CollectIn; use bumpalo::Bump; use bumparaw_collections::RawMap; @@ -10,11 +12,12 @@ use serde_json::value::RawValue; use serde_json::Deserializer; use super::super::document_change::DocumentChange; -use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress}; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; +use crate::progress::{AtomicSubStep, Progress}; use crate::update::new::document::Versions; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use 
crate::update::new::thread_local::MostlySend; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; @@ -45,7 +48,7 @@ impl<'pl> DocumentOperation<'pl> { #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")] - pub fn into_changes( + pub fn into_changes( self, indexer: &'pl Bump, index: &Index, @@ -53,12 +56,12 @@ impl<'pl> DocumentOperation<'pl> { primary_key_from_op: Option<&'pl str>, new_fields_ids_map: &mut FieldsIdsMap, must_stop_processing: &MSP, - send_progress: &SP, + progress: Progress, ) -> Result<(DocumentOperationChanges<'pl>, Vec, Option>)> where MSP: Fn() -> bool, - SP: Fn(Progress), { + progress.update_progress(IndexingStep::PreparingPayloads); let Self { operations, method } = self; let documents_ids = index.documents_ids(rtxn)?; @@ -68,16 +71,15 @@ impl<'pl> DocumentOperation<'pl> { let mut primary_key = None; let payload_count = operations.len(); + let (step, progress_step) = + AtomicSubStep::::new(payload_count as u32); + progress.update_progress(progress_step); for (payload_index, operation) in operations.into_iter().enumerate() { if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - send_progress(Progress::from_step_substep( - Step::PreparingPayloads, - payload_index as u32, - payload_count as u32, - )); + step.store(payload_index as u32, Ordering::Relaxed); let mut bytes = 0; let result = match operation { @@ -118,12 +120,7 @@ impl<'pl> DocumentOperation<'pl> { }; operations_stats.push(PayloadStats { document_count, bytes, error }); } - - send_progress(Progress::from_step_substep( - Step::PreparingPayloads, - payload_count as u32, - payload_count as u32, - )); + step.store(payload_count as u32, Ordering::Relaxed); // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 601645385..79416bcd5 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -5,7 +5,7 @@ use std::thread::{self, Builder}; use big_s::S; use bumparaw_collections::RawMap; -use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; +use document_changes::{extract, DocumentChanges, IndexingContext}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; @@ -22,7 +22,7 @@ use super::channel::*; use super::extract::*; use super::facet_search_builder::FacetSearchBuilder; use super::merger::FacetFieldIdsDelta; -use super::steps::Step; +use super::steps::IndexingStep; use super::thread_local::ThreadLocal; use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use super::words_prefix_docids::{ @@ -33,6 +33,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; +use crate::progress::Progress; use crate::proximity::ProximityPrecision; use crate::update::del_add::DelAdd; use crate::update::new::extract::EmbeddingExtractor; @@ -60,7 +61,7 @@ mod update_by_function; /// /// TODO return stats #[allow(clippy::too_many_arguments)] // clippy: 😝 -pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( +pub fn index<'pl, 
'indexer, 'index, DC, MSP>( wtxn: &mut RwTxn, index: &'index Index, pool: &ThreadPoolNoAbort, @@ -71,12 +72,11 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( document_changes: &DC, embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, - send_progress: &'indexer SP, + send_progress: &'indexer Progress, ) -> Result<()> where DC: DocumentChanges<'pl>, MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); @@ -159,7 +159,7 @@ where indexing_context, &mut extractor_allocs, &datastore, - Step::ExtractingDocuments, + IndexingStep::ExtractingDocuments, )?; } { @@ -191,7 +191,7 @@ where indexing_context, &mut extractor_allocs, &extractor_sender.field_id_docid_facet_sender(), - Step::ExtractingFacets + IndexingStep::ExtractingFacets )? }; @@ -224,7 +224,7 @@ where document_changes, indexing_context, &mut extractor_allocs, - Step::ExtractingWords + IndexingStep::ExtractingWords )? }; @@ -302,7 +302,7 @@ where document_changes, indexing_context, &mut extractor_allocs, - Step::ExtractingWordProximity, + IndexingStep::ExtractingWordProximity, )? }; @@ -338,7 +338,7 @@ where indexing_context, &mut extractor_allocs, &datastore, - Step::ExtractingEmbeddings, + IndexingStep::ExtractingEmbeddings, )?; } { @@ -371,7 +371,7 @@ where indexing_context, &mut extractor_allocs, &datastore, - Step::WritingGeoPoints + IndexingStep::WritingGeoPoints )?; } @@ -383,9 +383,7 @@ where &indexing_context.must_stop_processing, )?; } - - (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); - + indexing_context.send_progress.update_progress(IndexingStep::WritingToDatabase); finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); Result::Ok((facet_field_ids_delta, index_embeddings)) @@ -485,7 +483,7 @@ where )?; } - (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); + indexing_context.send_progress.update_progress(IndexingStep::WaitingForExtractors); let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; @@ -498,10 +496,9 @@ where break 'vectors; } - (indexing_context.send_progress)(Progress::from_step( - Step::WritingEmbeddingsToDatabase, - )); - + indexing_context + .send_progress + .update_progress(IndexingStep::WritingEmbeddingsToDatabase); let mut rng = rand::rngs::StdRng::seed_from_u64(42); for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { let dimensions = *dimensions; @@ -517,21 +514,19 @@ where index.put_embedding_configs(wtxn, index_embeddings)?; } - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); - + indexing_context.send_progress.update_progress(IndexingStep::PostProcessingFacets); if index.facet_search(wtxn)? { compute_facet_search_database(index, wtxn, global_fields_ids_map)?; } compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords)); - + indexing_context.send_progress.update_progress(IndexingStep::PostProcessingWords); if let Some(prefix_delta) = compute_word_fst(index, wtxn)? 
{ compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?; } - (indexing_context.send_progress)(Progress::from_step(Step::Finalizing)); + indexing_context.send_progress.update_progress(IndexingStep::Finalizing); Ok(()) as Result<_> })?; diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index bee1be260..9eb7d376d 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -1,8 +1,12 @@ +use std::borrow::Cow; + use enum_iterator::Sequence; +use crate::progress::Step; + #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] -#[repr(u16)] -pub enum Step { +#[repr(u8)] +pub enum IndexingStep { PreparingPayloads, ExtractingDocuments, ExtractingFacets, @@ -18,30 +22,31 @@ pub enum Step { Finalizing, } -impl Step { - pub fn name(&self) -> &'static str { +impl Step for IndexingStep { + fn name(&self) -> Cow<'static, str> { match self { - Step::PreparingPayloads => "preparing update file", - Step::ExtractingDocuments => "extracting documents", - Step::ExtractingFacets => "extracting facets", - Step::ExtractingWords => "extracting words", - Step::ExtractingWordProximity => "extracting word proximity", - Step::ExtractingEmbeddings => "extracting embeddings", - Step::WritingGeoPoints => "writing geo points", - Step::WritingToDatabase => "writing to database", - Step::WaitingForExtractors => "waiting for extractors", - Step::WritingEmbeddingsToDatabase => "writing embeddings to database", - Step::PostProcessingFacets => "post-processing facets", - Step::PostProcessingWords => "post-processing words", - Step::Finalizing => "finalizing", + IndexingStep::PreparingPayloads => "preparing update file", + IndexingStep::ExtractingDocuments => "extracting documents", + IndexingStep::ExtractingFacets => "extracting facets", + IndexingStep::ExtractingWords => "extracting words", + IndexingStep::ExtractingWordProximity => "extracting word proximity", + IndexingStep::ExtractingEmbeddings => "extracting embeddings", + IndexingStep::WritingGeoPoints => "writing geo points", + IndexingStep::WritingToDatabase => "writing to database", + IndexingStep::WaitingForExtractors => "waiting for extractors", + IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database", + IndexingStep::PostProcessingFacets => "post-processing facets", + IndexingStep::PostProcessingWords => "post-processing words", + IndexingStep::Finalizing => "finalizing", } + .into() } - pub fn finished_steps(self) -> u16 { - self as u16 + fn current(&self) -> u32 { + *self as u32 } - pub const fn total_steps() -> u16 { - Self::CARDINALITY as u16 + fn total(&self) -> u32 { + Self::CARDINALITY as u32 } } diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 418cdc356..ced81409d 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -3,6 +3,7 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::mmap_from_objects; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -57,7 +58,7 @@ fn test_facet_distribution_with_no_facet_values() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -72,7 +73,7 @@ fn test_facet_distribution_with_no_facet_values() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) 
     .unwrap();
 
diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs
index 08b22d7b6..30690969b 100644
--- a/crates/milli/tests/search/mod.rs
+++ b/crates/milli/tests/search/mod.rs
@@ -7,6 +7,7 @@ use bumpalo::Bump;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
 use maplit::{btreemap, hashset};
+use milli::progress::Progress;
 use milli::update::new::indexer;
 use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
 use milli::vector::EmbeddingConfigs;
@@ -90,7 +91,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
             None,
             &mut new_fields_ids_map,
             &|| false,
-            &|_progress| (),
+            Progress::default(),
         )
         .unwrap();
 
@@ -109,7 +110,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         &document_changes,
         embedders,
         &|| false,
-        &|_| (),
+        &Progress::default(),
     )
     .unwrap();
 
diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs
index 8401f0444..304059915 100644
--- a/crates/milli/tests/search/query_criteria.rs
+++ b/crates/milli/tests/search/query_criteria.rs
@@ -5,6 +5,7 @@ use bumpalo::Bump;
 use heed::EnvOpenOptions;
 use itertools::Itertools;
 use maplit::hashset;
+use milli::progress::Progress;
 use milli::update::new::indexer;
 use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
 use milli::vector::EmbeddingConfigs;
@@ -326,7 +327,7 @@ fn criteria_ascdesc() {
             None,
             &mut new_fields_ids_map,
             &|| false,
-            &|_progress| (),
+            Progress::default(),
         )
         .unwrap();
 
@@ -341,7 +342,7 @@ fn criteria_ascdesc() {
         &document_changes,
         embedders,
         &|| false,
-        &|_| (),
+        &Progress::default(),
     )
     .unwrap();
 
diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs
index dbee296ee..d33d79e54 100644
--- a/crates/milli/tests/search/typo_tolerance.rs
+++ b/crates/milli/tests/search/typo_tolerance.rs
@@ -3,6 +3,7 @@ use std::collections::BTreeSet;
 use bumpalo::Bump;
 use heed::EnvOpenOptions;
 use milli::documents::mmap_from_objects;
+use milli::progress::Progress;
 use milli::update::new::indexer;
 use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
 use milli::vector::EmbeddingConfigs;
@@ -135,7 +136,7 @@ fn test_typo_disabled_on_word() {
             None,
             &mut new_fields_ids_map,
             &|| false,
-            &|_progress| (),
+            Progress::default(),
         )
         .unwrap();
 
@@ -150,7 +151,7 @@ fn test_typo_disabled_on_word() {
         &document_changes,
         embedders,
         &|| false,
-        &|_| (),
+        &Progress::default(),
     )
     .unwrap();
 

From 6f4823fc9728236bd78c2f09affb7c1b1ae514ff Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 10 Dec 2024 16:58:13 +0100
Subject: [PATCH 131/158] make the number of documents in the document tasks more incremental

---
 crates/milli/src/update/new/indexer/document_changes.rs   | 8 +++-----
 crates/milli/src/update/new/indexer/document_operation.rs | 5 ++---
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs
index f2edfb1f3..799763658 100644
--- a/crates/milli/src/update/new/indexer/document_changes.rs
+++ b/crates/milli/src/update/new/indexer/document_changes.rs
@@ -230,7 +230,7 @@ where
     send_progress.update_progress(progress_step);
 
     let pi = document_changes.iter(CHUNK_SIZE);
-    pi.enumerate().try_arc_for_each_try_init(
+    pi.try_arc_for_each_try_init(
         || {
             DocumentChangeContext::new(
                 index,
@@ -243,13 +243,10 @@ where
                 move |index_alloc| extractor.init_data(index_alloc),
             )
         },
-        |context, (finished_documents, items)| {
+        |context, items| {
            if (must_stop_processing)() {
                return Err(Arc::new(InternalError::AbortedIndexation.into()));
            }
-            let finished_documents = (finished_documents * CHUNK_SIZE) as u32;
-
-            step.store(finished_documents, Ordering::Relaxed);
 
            // Clean up and reuse the document-specific allocator
            context.doc_alloc.reset();
@@ -260,6 +257,7 @@ where
            });
 
            let res = extractor.process(changes, context).map_err(Arc::new);
+            step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
 
            // send back the doc_alloc in the pool
            context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs
index 0ce53d5d2..4418944db 100644
--- a/crates/milli/src/update/new/indexer/document_operation.rs
+++ b/crates/milli/src/update/new/indexer/document_operation.rs
@@ -15,7 +15,7 @@ use super::super::document_change::DocumentChange;
 use super::document_changes::{DocumentChangeContext, DocumentChanges};
 use super::retrieve_or_guess_primary_key;
 use crate::documents::PrimaryKey;
-use crate::progress::{AtomicSubStep, Progress};
+use crate::progress::{AtomicDocumentStep, Progress};
 use crate::update::new::document::Versions;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::MostlySend;
@@ -71,8 +71,7 @@ impl<'pl> DocumentOperation<'pl> {
        let mut primary_key = None;
 
        let payload_count = operations.len();
-        let (step, progress_step) =
-            AtomicSubStep::::new(payload_count as u32);
+        let (step, progress_step) = AtomicDocumentStep::new(payload_count as u32);
        progress.update_progress(progress_step);
 
        for (payload_index, operation) in operations.into_iter().enumerate() {

From 867e6a8f1dc2fc7a1fbc5e351335213f2eb8ea6c Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 10 Dec 2024 17:04:04 +0100
Subject: [PATCH 132/158] rename the send_progress field to progress since it's not sending anything

---
 .../src/update/new/indexer/document_changes.rs  |  5 ++---
 .../src/update/new/indexer/document_deletion.rs |  2 +-
 crates/milli/src/update/new/indexer/mod.rs      | 16 +++++++---------
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs
index 799763658..3e2b9c036 100644
--- a/crates/milli/src/update/new/indexer/document_changes.rs
+++ b/crates/milli/src/update/new/indexer/document_changes.rs
@@ -144,8 +144,7 @@ pub struct IndexingContext<
     pub doc_allocs: &'indexer ThreadLocal>>,
     pub fields_ids_map_store: &'indexer ThreadLocal>>>,
     pub must_stop_processing: &'indexer MSP,
-    // TODO: TAMO: Rename field to progress
-    pub send_progress: &'indexer Progress,
+    pub progress: &'indexer Progress,
 }
 
 impl<
@@ -207,7 +206,7 @@ pub fn extract<
         doc_allocs,
         fields_ids_map_store,
         must_stop_processing,
-        send_progress,
+        progress: send_progress,
     }: IndexingContext<'fid, 'indexer, 'index, MSP>,
     extractor_allocs: &'extractor mut ThreadLocal>,
     datastore: &'data ThreadLocal,
diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs
index 33e69e49c..b42a6c859 100644
--- a/crates/milli/src/update/new/indexer/document_deletion.rs
+++ b/crates/milli/src/update/new/indexer/document_deletion.rs
@@ -165,7 +165,7 @@ mod test {
             doc_allocs: &doc_allocs,
             fields_ids_map_store: &fields_ids_map_store,
             must_stop_processing: &(|| false),
-            send_progress: &Progress::default(),
+            progress: &Progress::default(),
         };
 
        for _
in 0..3 { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 79416bcd5..acdf78304 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -125,7 +125,7 @@ where doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing, - send_progress, + progress: send_progress, }; let mut index_embeddings = index.embedding_configs(wtxn)?; @@ -383,7 +383,7 @@ where &indexing_context.must_stop_processing, )?; } - indexing_context.send_progress.update_progress(IndexingStep::WritingToDatabase); + indexing_context.progress.update_progress(IndexingStep::WritingToDatabase); finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); Result::Ok((facet_field_ids_delta, index_embeddings)) @@ -483,7 +483,7 @@ where )?; } - indexing_context.send_progress.update_progress(IndexingStep::WaitingForExtractors); + indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; @@ -496,9 +496,7 @@ where break 'vectors; } - indexing_context - .send_progress - .update_progress(IndexingStep::WritingEmbeddingsToDatabase); + indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); let mut rng = rand::rngs::StdRng::seed_from_u64(42); for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { let dimensions = *dimensions; @@ -514,19 +512,19 @@ where index.put_embedding_configs(wtxn, index_embeddings)?; } - indexing_context.send_progress.update_progress(IndexingStep::PostProcessingFacets); + indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); if index.facet_search(wtxn)? { compute_facet_search_database(index, wtxn, global_fields_ids_map)?; } compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - indexing_context.send_progress.update_progress(IndexingStep::PostProcessingWords); + indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); if let Some(prefix_delta) = compute_word_fst(index, wtxn)? 
{ compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?; } - indexing_context.send_progress.update_progress(IndexingStep::Finalizing); + indexing_context.progress.update_progress(IndexingStep::Finalizing); Ok(()) as Result<_> })?; From ab75f53efdd2f408d95a9bfa187ad5ade93a0e7d Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 10 Dec 2024 17:09:10 +0100 Subject: [PATCH 133/158] update all snapshots --- crates/index-scheduler/src/lib.rs | 10 ++++++++++ crates/meilisearch/tests/batches/mod.rs | 21 +++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index f5f73087d..d3e65c6f8 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4308,6 +4308,16 @@ mod tests { snapshot!(batch, @r#" { "uid": 0, + "progress": { + "steps": [ + { + "name": "processing tasks", + "finished": 0, + "total": 2 + } + ], + "percentage": 0.0 + }, "details": { "primaryKey": "mouse" }, diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index 9c869c140..581e92837 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -284,6 +284,7 @@ async fn test_summarized_document_addition_or_update() { @r#" { "uid": 0, + "progress": null, "details": { "receivedDocuments": 1, "indexedDocuments": 1 @@ -314,6 +315,7 @@ async fn test_summarized_document_addition_or_update() { @r#" { "uid": 1, + "progress": null, "details": { "receivedDocuments": 1, "indexedDocuments": 1 @@ -349,6 +351,7 @@ async fn test_summarized_delete_documents_by_batch() { @r#" { "uid": 0, + "progress": null, "details": { "providedIds": 3, "deletedDocuments": 0 @@ -380,6 +383,7 @@ async fn test_summarized_delete_documents_by_batch() { @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -416,6 +420,7 @@ async fn test_summarized_delete_documents_by_filter() { @r#" { "uid": 0, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -448,6 +453,7 @@ async fn test_summarized_delete_documents_by_filter() { @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -480,6 +486,7 @@ async fn test_summarized_delete_documents_by_filter() { @r#" { "uid": 4, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -516,6 +523,7 @@ async fn test_summarized_delete_document_by_id() { @r#" { "uid": 0, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -547,6 +555,7 @@ async fn test_summarized_delete_document_by_id() { @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -594,6 +603,7 @@ async fn test_summarized_settings_update() { @r#" { "uid": 0, + "progress": null, "details": { "displayedAttributes": [ "doggos", @@ -638,6 +648,7 @@ async fn test_summarized_index_creation() { @r#" { "uid": 0, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -665,6 +676,7 @@ async fn test_summarized_index_creation() { @r#" { "uid": 1, + "progress": null, "details": { "primaryKey": "doggos" }, @@ -809,6 +821,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 0, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -836,6 +849,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 1, + "progress": null, "details": { "primaryKey": "bones" }, @@ -868,6 +882,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 3, + "progress": null, "details": {}, 
"stats": { "totalNbTasks": 1, @@ -895,6 +910,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 4, + "progress": null, "details": { "primaryKey": "bones" }, @@ -932,6 +948,7 @@ async fn test_summarized_index_swap() { @r#" { "uid": 0, + "progress": null, "details": { "swaps": [ { @@ -972,6 +989,7 @@ async fn test_summarized_index_swap() { @r#" { "uid": 3, + "progress": null, "details": { "swaps": [ { @@ -1014,6 +1032,7 @@ async fn test_summarized_batch_cancelation() { @r#" { "uid": 1, + "progress": null, "details": { "matchedTasks": 1, "canceledTasks": 0, @@ -1051,6 +1070,7 @@ async fn test_summarized_batch_deletion() { @r#" { "uid": 1, + "progress": null, "details": { "matchedTasks": 1, "deletedTasks": 1, @@ -1084,6 +1104,7 @@ async fn test_summarized_dump_creation() { @r#" { "uid": 0, + "progress": null, "details": { "dumpUid": "[dumpUid]" }, From 26733c705d55be4788c9a513b3654d71498679ff Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 10 Dec 2024 22:29:31 +0100 Subject: [PATCH 134/158] add progress for the task deletion and task cancelation --- Cargo.lock | 1 + crates/index-scheduler/Cargo.toml | 1 + crates/index-scheduler/src/batch.rs | 57 +++++++++++++-- crates/index-scheduler/src/processing.rs | 88 ++++++++++++++++-------- 4 files changed, 115 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de7dabc36..91c83fb13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2632,6 +2632,7 @@ dependencies = [ "bincode", "bumpalo", "bumparaw-collections", + "convert_case 0.6.0", "crossbeam-channel", "csv", "derive_builder 0.20.0", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 5d7eb1913..ec2f17f84 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -15,6 +15,7 @@ anyhow = "1.0.86" bincode = "1.3.3" bumpalo = "3.16.0" bumparaw-collections = "0.1.2" +convert_case = "0.6.0" csv = "1.3.0" derive_builder = "0.20.0" dump = { path = "../dump" } diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 1bfa7f53b..fe055b185 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -22,6 +22,7 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; +use std::sync::atomic::Ordering; use bumpalo::collections::CollectIn; use bumpalo::Bump; @@ -48,6 +49,9 @@ use time::OffsetDateTime; use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; +use crate::processing::{ + AtomicBatchStep, AtomicTaskStep, TaskCancelationProgress, TaskDeletionProgress, +}; use crate::utils::{self, swap_index_uid_in_task, ProcessingBatch}; use crate::{Error, IndexScheduler, Result, TaskId}; @@ -583,8 +587,13 @@ impl IndexScheduler { }; let rtxn = self.env.read_txn()?; - let mut canceled_tasks = - self.cancel_matched_tasks(&rtxn, task.uid, current_batch, matched_tasks)?; + let mut canceled_tasks = self.cancel_matched_tasks( + &rtxn, + task.uid, + current_batch, + matched_tasks, + &progress, + )?; task.status = Status::Succeeded; match &mut task.details { @@ -615,7 +624,8 @@ impl IndexScheduler { } let mut wtxn = self.env.write_txn()?; - let mut deleted_tasks = self.delete_matched_tasks(&mut wtxn, &matched_tasks)?; + let mut deleted_tasks = + self.delete_matched_tasks(&mut wtxn, &matched_tasks, &progress)?; wtxn.commit()?; for task in tasks.iter_mut() { @@ -1664,7 +1674,10 @@ impl IndexScheduler { &self, wtxn: &mut RwTxn, matched_tasks: &RoaringBitmap, + progress: &Progress, ) -> Result { + 
progress.update_progress(TaskDeletionProgress::DeletingTasksDateTime); + // 1. Remove from this list the tasks that we are not allowed to delete let enqueued_tasks = self.get_status(wtxn, Status::Enqueued)?; let processing_tasks = &self.processing_tasks.read().unwrap().processing.clone(); @@ -1683,6 +1696,8 @@ impl IndexScheduler { // The tasks that have been removed *per batches*. let mut affected_batches: HashMap = HashMap::new(); + let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32); + progress.update_progress(task_progress); for task_id in to_delete_tasks.iter() { let task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; @@ -1706,22 +1721,35 @@ impl IndexScheduler { if let Some(batch_uid) = task.batch_uid { affected_batches.entry(batch_uid).or_default().insert(task_id); } + atomic_progress.fetch_add(1, Ordering::Relaxed); } + progress.update_progress(TaskDeletionProgress::DeletingTasksMetadata); + let (atomic_progress, task_progress) = AtomicTaskStep::new( + (affected_indexes.len() + affected_statuses.len() + affected_kinds.len()) as u32, + ); + progress.update_progress(task_progress); for index in affected_indexes.iter() { self.update_index(wtxn, index, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } for status in affected_statuses.iter() { self.update_status(wtxn, *status, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } for kind in affected_kinds.iter() { self.update_kind(wtxn, *kind, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } + progress.update_progress(TaskDeletionProgress::DeletingTasks); + let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32); + progress.update_progress(task_progress); for task in to_delete_tasks.iter() { self.all_tasks.delete(wtxn, &task)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } for canceled_by in affected_canceled_by { if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? { @@ -1733,6 +1761,9 @@ impl IndexScheduler { } } } + progress.update_progress(TaskDeletionProgress::DeletingBatches); + let (atomic_progress, batch_progress) = AtomicBatchStep::new(affected_batches.len() as u32); + progress.update_progress(batch_progress); for (batch_id, to_delete_tasks) in affected_batches { if let Some(mut tasks) = self.batch_to_tasks_mapping.get(wtxn, &batch_id)? { tasks -= &to_delete_tasks; @@ -1774,6 +1805,7 @@ impl IndexScheduler { } } } + atomic_progress.fetch_add(1, Ordering::Relaxed); } Ok(to_delete_tasks) @@ -1788,21 +1820,36 @@ impl IndexScheduler { cancel_task_id: TaskId, current_batch: &mut ProcessingBatch, matched_tasks: &RoaringBitmap, + progress: &Progress, ) -> Result> { + progress.update_progress(TaskCancelationProgress::RetrievingTasks); + // 1. Remove from this list the tasks that we are not allowed to cancel // Notice that only the _enqueued_ ones are cancelable and we should // have already aborted the indexation of the _processing_ ones let cancelable_tasks = self.get_status(rtxn, Status::Enqueued)?; let tasks_to_cancel = cancelable_tasks & matched_tasks; - // 2. We now have a list of tasks to cancel, cancel them - let mut tasks = self.get_existing_tasks(rtxn, tasks_to_cancel.iter())?; + let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32); + progress.update_progress(progress_obj); + // 2. 
We now have a list of tasks to cancel, cancel them + let mut tasks = self.get_existing_tasks( + rtxn, + tasks_to_cancel.iter().inspect(|_| { + task_progress.fetch_add(1, Ordering::Relaxed); + }), + )?; + + progress.update_progress(TaskCancelationProgress::UpdatingTasks); + let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32); + progress.update_progress(progress_obj); for task in tasks.iter_mut() { task.status = Status::Canceled; task.canceled_by = Some(cancel_task_id); task.details = task.details.as_ref().map(|d| d.to_failed()); current_batch.processing(Some(task)); + task_progress.fetch_add(1, Ordering::Relaxed); } Ok(tasks) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index e5e892927..f28fa0219 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -1,4 +1,5 @@ use crate::utils::ProcessingBatch; +use enum_iterator::Sequence; use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}; use roaring::RoaringBitmap; use std::{borrow::Cow, sync::Arc}; @@ -54,39 +55,72 @@ impl ProcessingTasks { } } -#[repr(u8)] -#[derive(Copy, Clone)] -pub enum BatchProgress { - ProcessingTasks, - WritingTasksToDisk, -} - -impl Step for BatchProgress { - fn name(&self) -> Cow<'static, str> { - match self { - BatchProgress::ProcessingTasks => Cow::Borrowed("processing tasks"), - BatchProgress::WritingTasksToDisk => Cow::Borrowed("writing tasks to disk"), +macro_rules! make_enum_progress { + (enum $name:ident: $(- $variant:ident)+ ) => { + #[repr(u8)] + #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] + #[allow(clippy::enum_variant_names)] + pub enum $name { + $($variant),+ } - } - fn current(&self) -> u32 { - *self as u8 as u32 - } + impl Step for $name { + fn name(&self) -> Cow<'static, str> { + use convert_case::Casing; - fn total(&self) -> u32 { - 2 - } + match self { + $( + $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into() + ),+ + } + } + + fn current(&self) -> u32 { + *self as u32 + } + + fn total(&self) -> u32 { + Self::CARDINALITY as u32 + } + } + }; } -#[derive(Default)] -pub struct Task {} - -impl NamedStep for Task { - fn name(&self) -> &'static str { - "task" - } +macro_rules! make_atomic_progress { + ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => { + #[derive(Default, Debug, Clone, Copy)] + pub struct $struct_name {} + impl NamedStep for $struct_name { + fn name(&self) -> &'static str { + $step_name + } + } + pub type $atomic_struct_name = AtomicSubStep<$struct_name>; + }; } -pub type AtomicTaskStep = AtomicSubStep; + +make_enum_progress! { + enum BatchProgress: + - ProcessingTasks + - WritingTasksToDisk +} + +make_enum_progress! { + enum TaskCancelationProgress: + - RetrievingTasks + - UpdatingTasks +} + +make_enum_progress! 
{ + enum TaskDeletionProgress: + - DeletingTasksDateTime + - DeletingTasksMetadata + - DeletingTasks + - DeletingBatches +} + +make_atomic_progress!(Task alias AtomicTaskStep => "task" ); +make_atomic_progress!(Batch alias AtomicBatchStep => "batch" ); #[cfg(test)] mod test { From 786b0fabea2a979442923f32ffbecc8208671cf9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 16:18:12 +0100 Subject: [PATCH 135/158] implement the progress for almost all the tasks --- crates/index-scheduler/src/batch.rs | 106 +++++++++++++++++++++-- crates/index-scheduler/src/processing.rs | 103 ++++++++++++++++++++++ 2 files changed, 203 insertions(+), 6 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index fe055b185..733984043 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -50,7 +50,11 @@ use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, TaskCancelationProgress, TaskDeletionProgress, + AtomicBatchStep, AtomicDocumentStep, AtomicTaskStep, AtomicUpdateFileStep, CreateIndexProgress, + DeleteIndexProgress, DocumentDeletionProgress, DocumentEditionProgress, + DocumentOperationProgress, DumpCreationProgress, InnerSwappingTwoIndexes, SettingsProgress, + SnapshotCreationProgress, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, + UpdateIndexProgress, VariableNameStep, }; use crate::utils::{self, swap_index_uid_in_task, ProcessingBatch}; use crate::{Error, IndexScheduler, Result, TaskId}; @@ -651,6 +655,8 @@ impl IndexScheduler { Ok(tasks) } Batch::SnapshotCreation(mut tasks) => { + progress.update_progress(SnapshotCreationProgress::StartTheSnapshotCreation); + fs::create_dir_all(&self.snapshots_path)?; let temp_snapshot_dir = tempfile::tempdir()?; @@ -671,6 +677,7 @@ impl IndexScheduler { // two read operations as the task processing is synchronous. // 2.1 First copy the LMDB env of the index-scheduler + progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexScheduler); let dst = temp_snapshot_dir.path().join("tasks"); fs::create_dir_all(&dst)?; self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; @@ -683,6 +690,11 @@ impl IndexScheduler { fs::create_dir_all(&update_files_dir)?; // 2.4 Only copy the update files of the enqueued tasks + progress.update_progress(SnapshotCreationProgress::SnapshotTheUpdateFiles); + let enqueued = self.get_status(&rtxn, Status::Enqueued)?; + let (atomic, update_file_progress) = + AtomicUpdateFileStep::new(enqueued.len() as u32); + progress.update_progress(update_file_progress); for task_id in self.get_status(&rtxn, Status::Enqueued)? { let task = self.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; if let Some(content_uuid) = task.content_uuid() { @@ -690,11 +702,17 @@ impl IndexScheduler { let dst = update_files_dir.join(content_uuid.to_string()); fs::copy(src, dst)?; } + atomic.fetch_add(1, Ordering::Relaxed); } // 3. Snapshot every indexes - for result in self.index_mapper.index_mapping.iter(&rtxn)? { + progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexes); + let index_mapping = self.index_mapper.index_mapping; + let nb_indexes = index_mapping.len(&rtxn)? 
as u32; + + for (i, result) in index_mapping.iter(&rtxn)?.enumerate() { let (name, uuid) = result?; + progress.update_progress(VariableNameStep::new(name, i as u32, nb_indexes)); let index = self.index_mapper.index(&rtxn, name)?; let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string()); fs::create_dir_all(&dst)?; @@ -706,6 +724,7 @@ impl IndexScheduler { drop(rtxn); // 4. Snapshot the auth LMDB env + progress.update_progress(SnapshotCreationProgress::SnapshotTheApiKeys); let dst = temp_snapshot_dir.path().join("auth"); fs::create_dir_all(&dst)?; // TODO We can't use the open_auth_store_env function here but we should @@ -718,6 +737,7 @@ impl IndexScheduler { auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; // 5. Copy and tarball the flat snapshot + progress.update_progress(SnapshotCreationProgress::CreateTheTarball); // 5.1 Find the original name of the database // TODO find a better way to get this path let mut base_path = self.env.path().to_owned(); @@ -750,6 +770,7 @@ impl IndexScheduler { Ok(tasks) } Batch::Dump(mut task) => { + progress.update_progress(DumpCreationProgress::StartTheDumpCreation); let started_at = OffsetDateTime::now_utc(); let (keys, instance_uid) = if let KindWithContent::DumpCreation { keys, instance_uid } = &task.kind { @@ -760,6 +781,7 @@ impl IndexScheduler { let dump = dump::DumpWriter::new(*instance_uid)?; // 1. dump the keys + progress.update_progress(DumpCreationProgress::DumpTheApiKeys); let mut dump_keys = dump.create_keys()?; for key in keys { dump_keys.push_key(key)?; @@ -769,7 +791,13 @@ impl IndexScheduler { let rtxn = self.env.read_txn()?; // 2. dump the tasks + progress.update_progress(DumpCreationProgress::DumpTheTasks); let mut dump_tasks = dump.create_tasks_queue()?; + + let (atomic, update_task_progress) = + AtomicTaskStep::new(self.all_tasks.len(&rtxn)? as u32); + progress.update_progress(update_task_progress); + for ret in self.all_tasks.iter(&rtxn)? { if self.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -819,11 +847,22 @@ impl IndexScheduler { dump_content_file.flush()?; } } + atomic.fetch_add(1, Ordering::Relaxed); } dump_tasks.flush()?; // 3. Dump the indexes + progress.update_progress(DumpCreationProgress::DumpTheIndexes); + let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32; + let mut count = 0; self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> { + progress.update_progress(VariableNameStep::new( + uid.to_string(), + count, + nb_indexes, + )); + count += 1; + let rtxn = index.read_txn()?; let metadata = IndexMetadata { uid: uid.to_owned(), @@ -843,6 +882,12 @@ impl IndexScheduler { .embedding_configs(&rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let nb_documents = index + .number_of_documents(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? + as u32; + let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents); + progress.update_progress(update_document_progress); let documents = index .all_documents(&rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; @@ -912,6 +957,7 @@ impl IndexScheduler { } index_dumper.push_document(&document)?; + atomic.fetch_add(1, Ordering::Relaxed); } // 3.2. Dump the settings @@ -926,6 +972,7 @@ impl IndexScheduler { })?; // 4. 
Dump experimental feature settings + progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures); let features = self.features().runtime_features(); dump.create_experimental_features(features)?; @@ -936,6 +983,7 @@ impl IndexScheduler { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } + progress.update_progress(DumpCreationProgress::CompressTheDump); let path = self.dumps_path.join(format!("{}.dump", dump_uid)); let file = File::create(path)?; dump.persist_to(BufWriter::new(file))?; @@ -995,6 +1043,8 @@ impl IndexScheduler { Ok(tasks) } Batch::IndexCreation { index_uid, primary_key, task } => { + progress.update_progress(CreateIndexProgress::CreatingTheIndex); + let wtxn = self.env.write_txn()?; if self.index_mapper.exists(&wtxn, &index_uid)? { return Err(Error::IndexAlreadyExists(index_uid)); @@ -1008,6 +1058,7 @@ impl IndexScheduler { ) } Batch::IndexUpdate { index_uid, primary_key, mut task } => { + progress.update_progress(UpdateIndexProgress::UpdatingTheIndex); let rtxn = self.env.read_txn()?; let index = self.index_mapper.index(&rtxn, &index_uid)?; @@ -1060,6 +1111,7 @@ impl IndexScheduler { Ok(vec![task]) } Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => { + progress.update_progress(DeleteIndexProgress::DeletingTheIndex); let wtxn = self.env.write_txn()?; // it's possible that the index doesn't exist @@ -1093,6 +1145,8 @@ impl IndexScheduler { Ok(tasks) } Batch::IndexSwap { mut task } => { + progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap); + let mut wtxn = self.env.write_txn()?; let swaps = if let KindWithContent::IndexSwap { swaps } = &task.kind { swaps @@ -1119,8 +1173,20 @@ impl IndexScheduler { )); } } - for swap in swaps { - self.apply_index_swap(&mut wtxn, task.uid, &swap.indexes.0, &swap.indexes.1)?; + progress.update_progress(SwappingTheIndexes::SwappingTheIndexes); + for (step, swap) in swaps.iter().enumerate() { + progress.update_progress(VariableNameStep::new( + format!("swapping index {} and {}", swap.indexes.0, swap.indexes.1), + step as u32, + swaps.len() as u32, + )); + self.apply_index_swap( + &mut wtxn, + &progress, + task.uid, + &swap.indexes.0, + &swap.indexes.1, + )?; } wtxn.commit()?; task.status = Status::Succeeded; @@ -1130,7 +1196,15 @@ impl IndexScheduler { } /// Swap the index `lhs` with the index `rhs`. - fn apply_index_swap(&self, wtxn: &mut RwTxn, task_id: u32, lhs: &str, rhs: &str) -> Result<()> { + fn apply_index_swap( + &self, + wtxn: &mut RwTxn, + progress: &Progress, + task_id: u32, + lhs: &str, + rhs: &str, + ) -> Result<()> { + progress.update_progress(InnerSwappingTwoIndexes::RetrieveTheTasks); // 1. Verify that both lhs and rhs are existing indexes let index_lhs_exists = self.index_mapper.index_exists(wtxn, lhs)?; if !index_lhs_exists { @@ -1148,14 +1222,21 @@ impl IndexScheduler { index_rhs_task_ids.remove_range(task_id..); // 3. before_name -> new_name in the task's KindWithContent - for task_id in &index_lhs_task_ids | &index_rhs_task_ids { + progress.update_progress(InnerSwappingTwoIndexes::UpdateTheTasks); + let tasks_to_update = &index_lhs_task_ids | &index_rhs_task_ids; + let (atomic, task_progress) = AtomicTaskStep::new(tasks_to_update.len() as u32); + progress.update_progress(task_progress); + + for task_id in tasks_to_update { let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; swap_index_uid_in_task(&mut task, (lhs, rhs)); self.all_tasks.put(wtxn, &task_id, &task)?; + atomic.fetch_add(1, Ordering::Relaxed); } // 4. 
remove the task from indexuid = before_name // 5. add the task to indexuid = after_name + progress.update_progress(InnerSwappingTwoIndexes::UpdateTheIndexesMetadata); self.update_index(wtxn, lhs, |lhs_tasks| { *lhs_tasks -= &index_lhs_task_ids; *lhs_tasks |= &index_rhs_task_ids; @@ -1222,6 +1303,7 @@ impl IndexScheduler { operations, mut tasks, } => { + progress.update_progress(DocumentOperationProgress::RetrievingConfig); // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. // this is made difficult by the fact we're doing private clones of the index scheduler and sending it // to a fresh thread. @@ -1277,6 +1359,7 @@ impl IndexScheduler { } }; + progress.update_progress(DocumentOperationProgress::ComputingTheChanges); let (document_changes, operation_stats, primary_key) = indexer .into_changes( &indexer_alloc, @@ -1321,6 +1404,7 @@ impl IndexScheduler { } } + progress.update_progress(DocumentOperationProgress::Indexing); if tasks.iter().any(|res| res.error.is_none()) { indexer::index( index_wtxn, @@ -1350,6 +1434,8 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::DocumentEdition { index_uid, mut task } => { + progress.update_progress(DocumentEditionProgress::RetrievingConfig); + let (filter, code) = if let KindWithContent::DocumentEdition { filter_expr, context: _, @@ -1423,6 +1509,7 @@ impl IndexScheduler { }; let candidates_count = candidates.len(); + progress.update_progress(DocumentEditionProgress::ComputingTheChanges); let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); let document_changes = pool .install(|| { @@ -1436,6 +1523,7 @@ impl IndexScheduler { .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; + progress.update_progress(DocumentEditionProgress::Indexing); indexer::index( index_wtxn, index, @@ -1488,6 +1576,8 @@ impl IndexScheduler { Ok(vec![task]) } IndexOperation::DocumentDeletion { mut tasks, index_uid } => { + progress.update_progress(DocumentDeletionProgress::RetrievingConfig); + let mut to_delete = RoaringBitmap::new(); let external_documents_ids = index.external_documents_ids(); @@ -1578,6 +1668,7 @@ impl IndexScheduler { } }; + progress.update_progress(DocumentDeletionProgress::DeleteDocuments); let mut indexer = indexer::DocumentDeletion::new(); let candidates_count = to_delete.len(); indexer.delete_documents_by_docids(to_delete); @@ -1587,6 +1678,7 @@ impl IndexScheduler { .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; + progress.update_progress(DocumentDeletionProgress::Indexing); indexer::index( index_wtxn, index, @@ -1615,6 +1707,7 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::Settings { index_uid, settings, mut tasks } => { + progress.update_progress(SettingsProgress::RetrievingAndMergingTheSettings); let indexer_config = self.index_mapper.indexer_config(); let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config); @@ -1628,6 +1721,7 @@ impl IndexScheduler { task.status = Status::Succeeded; } + progress.update_progress(SettingsProgress::ApplyTheSettings); builder .execute( |indexing_step| tracing::debug!(update = ?indexing_step), diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index f28fa0219..479b6274f 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -119,8 +119,111 @@ 
make_enum_progress! { - DeletingBatches } +make_enum_progress! { + enum SnapshotCreationProgress: + - StartTheSnapshotCreation + - SnapshotTheIndexScheduler + - SnapshotTheUpdateFiles + - SnapshotTheIndexes + - SnapshotTheApiKeys + - CreateTheTarball +} + +make_enum_progress! { + enum DumpCreationProgress: + - StartTheDumpCreation + - DumpTheApiKeys + - DumpTheTasks + - DumpTheIndexes + - DumpTheExperimentalFeatures + - CompressTheDump +} + +make_enum_progress! { + enum CreateIndexProgress: + - CreatingTheIndex +} + +make_enum_progress! { + enum UpdateIndexProgress: + - UpdatingTheIndex +} + +make_enum_progress! { + enum DeleteIndexProgress: + - DeletingTheIndex +} + +make_enum_progress! { + enum SwappingTheIndexes: + - EnsuringCorrectnessOfTheSwap + - SwappingTheIndexes +} + +make_enum_progress! { + enum InnerSwappingTwoIndexes: + - RetrieveTheTasks + - UpdateTheTasks + - UpdateTheIndexesMetadata +} + +make_enum_progress! { + enum DocumentOperationProgress: + - RetrievingConfig + - ComputingTheChanges + - Indexing +} + +make_enum_progress! { + enum DocumentEditionProgress: + - RetrievingConfig + - ComputingTheChanges + - Indexing +} + +make_enum_progress! { + enum DocumentDeletionProgress: + - RetrievingConfig + - DeleteDocuments + - Indexing +} + +make_enum_progress! { + enum SettingsProgress: + - RetrievingAndMergingTheSettings + - ApplyTheSettings +} + make_atomic_progress!(Task alias AtomicTaskStep => "task" ); +make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); make_atomic_progress!(Batch alias AtomicBatchStep => "batch" ); +make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" ); + +pub struct VariableNameStep { + name: String, + current: u32, + total: u32, +} + +impl VariableNameStep { + pub fn new(name: impl Into, current: u32, total: u32) -> Self { + Self { name: name.into(), current, total } + } +} + +impl Step for VariableNameStep { + fn name(&self) -> Cow<'static, str> { + self.name.clone().into() + } + + fn current(&self) -> u32 { + self.current + } + + fn total(&self) -> u32 { + self.total + } +} #[cfg(test)] mod test { From 1f54dfa883adf86164ecc585561c01f55cfefde8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 16:23:21 +0100 Subject: [PATCH 136/158] update the macro to look more like an enum --- crates/index-scheduler/src/processing.rs | 124 +++++++++++++---------- 1 file changed, 69 insertions(+), 55 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 479b6274f..0bc449199 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -56,11 +56,11 @@ impl ProcessingTasks { } macro_rules! make_enum_progress { - (enum $name:ident: $(- $variant:ident)+ ) => { + ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => { #[repr(u8)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] #[allow(clippy::enum_variant_names)] - pub enum $name { + $visibility enum $name { $($variant),+ } @@ -100,98 +100,112 @@ macro_rules! make_atomic_progress { } make_enum_progress! { - enum BatchProgress: - - ProcessingTasks - - WritingTasksToDisk + pub enum BatchProgress { + ProcessingTasks, + WritingTasksToDisk, + } } make_enum_progress! { - enum TaskCancelationProgress: - - RetrievingTasks - - UpdatingTasks + pub enum TaskCancelationProgress { + RetrievingTasks, + UpdatingTasks, + } } make_enum_progress! 
{ - enum TaskDeletionProgress: - - DeletingTasksDateTime - - DeletingTasksMetadata - - DeletingTasks - - DeletingBatches + pub enum TaskDeletionProgress { + DeletingTasksDateTime, + DeletingTasksMetadata, + DeletingTasks, + DeletingBatches, + } } make_enum_progress! { - enum SnapshotCreationProgress: - - StartTheSnapshotCreation - - SnapshotTheIndexScheduler - - SnapshotTheUpdateFiles - - SnapshotTheIndexes - - SnapshotTheApiKeys - - CreateTheTarball + pub enum SnapshotCreationProgress { + StartTheSnapshotCreation, + SnapshotTheIndexScheduler, + SnapshotTheUpdateFiles, + SnapshotTheIndexes, + SnapshotTheApiKeys, + CreateTheTarball, + } } make_enum_progress! { - enum DumpCreationProgress: - - StartTheDumpCreation - - DumpTheApiKeys - - DumpTheTasks - - DumpTheIndexes - - DumpTheExperimentalFeatures - - CompressTheDump + pub enum DumpCreationProgress { + StartTheDumpCreation, + DumpTheApiKeys, + DumpTheTasks, + DumpTheIndexes, + DumpTheExperimentalFeatures, + CompressTheDump, + } } make_enum_progress! { - enum CreateIndexProgress: - - CreatingTheIndex + pub enum CreateIndexProgress { + CreatingTheIndex, + } } make_enum_progress! { - enum UpdateIndexProgress: - - UpdatingTheIndex + pub enum UpdateIndexProgress { + UpdatingTheIndex, + } } make_enum_progress! { - enum DeleteIndexProgress: - - DeletingTheIndex + pub enum DeleteIndexProgress { + DeletingTheIndex, + } } make_enum_progress! { - enum SwappingTheIndexes: - - EnsuringCorrectnessOfTheSwap - - SwappingTheIndexes + pub enum SwappingTheIndexes { + EnsuringCorrectnessOfTheSwap, + SwappingTheIndexes, + } } make_enum_progress! { - enum InnerSwappingTwoIndexes: - - RetrieveTheTasks - - UpdateTheTasks - - UpdateTheIndexesMetadata + pub enum InnerSwappingTwoIndexes { + RetrieveTheTasks, + UpdateTheTasks, + UpdateTheIndexesMetadata, + } } make_enum_progress! { - enum DocumentOperationProgress: - - RetrievingConfig - - ComputingTheChanges - - Indexing + pub enum DocumentOperationProgress { + RetrievingConfig, + ComputingTheChanges, + Indexing, + } } make_enum_progress! { - enum DocumentEditionProgress: - - RetrievingConfig - - ComputingTheChanges - - Indexing + pub enum DocumentEditionProgress { + RetrievingConfig, + ComputingTheChanges, + Indexing, + } } make_enum_progress! { - enum DocumentDeletionProgress: - - RetrievingConfig - - DeleteDocuments - - Indexing + pub enum DocumentDeletionProgress { + RetrievingConfig, + DeleteDocuments, + Indexing, + } } make_enum_progress! 
{ - enum SettingsProgress: - - RetrievingAndMergingTheSettings - - ApplyTheSettings + pub enum SettingsProgress { + RetrievingAndMergingTheSettings, + ApplyTheSettings, + } } make_atomic_progress!(Task alias AtomicTaskStep => "task" ); From 04a24a9239a8fbcd5e45bd2f154ef20e7ef91f59 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 11 Dec 2024 16:27:07 +0100 Subject: [PATCH 137/158] Kill Meilisearch with a TERM signal --- crates/meilisearch/src/main.rs | 5 +++ crates/xtask/src/bench/meili_process.rs | 49 +++++++++++++++++++++---- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index b4b46bec4..6e6245c78 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -129,6 +129,11 @@ async fn try_main() -> anyhow::Result<()> { print_launch_resume(&opt, analytics.clone(), config_read_from); + tokio::spawn(async move { + tokio::signal::ctrl_c().await.unwrap(); + std::process::exit(77); + }); + run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; Ok(()) diff --git a/crates/xtask/src/bench/meili_process.rs b/crates/xtask/src/bench/meili_process.rs index db787e595..2aff679fc 100644 --- a/crates/xtask/src/bench/meili_process.rs +++ b/crates/xtask/src/bench/meili_process.rs @@ -1,23 +1,56 @@ use std::collections::BTreeMap; +use std::time::Duration; use anyhow::{bail, Context as _}; +use tokio::process::Command; +use tokio::time; use super::assets::Asset; use super::client::Client; use super::workload::Workload; pub async fn kill(mut meilisearch: tokio::process::Child) { - if let Err(error) = meilisearch.kill().await { - tracing::warn!( - error = &error as &dyn std::error::Error, - "while terminating Meilisearch server" - ) + let Some(id) = meilisearch.id() else { return }; + + match Command::new("kill").args(["--signal=TERM", &id.to_string()]).spawn() { + Ok(mut cmd) => { + let Err(error) = cmd.wait().await else { return }; + tracing::warn!( + error = &error as &dyn std::error::Error, + "while awaiting the Meilisearch server kill" + ); + } + Err(error) => { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server with a kill -s TERM" + ); + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } + return; + } + }; + + match time::timeout(Duration::from_secs(5), meilisearch.wait()).await { + Ok(_) => (), + Err(_) => { + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } + } } } #[tracing::instrument] pub async fn build() -> anyhow::Result<()> { - let mut command = tokio::process::Command::new("cargo"); + let mut command = Command::new("cargo"); command.arg("build").arg("--release").arg("-p").arg("meilisearch"); command.kill_on_drop(true); @@ -37,7 +70,7 @@ pub async fn start( master_key: Option<&str>, workload: &Workload, asset_folder: &str, - mut command: tokio::process::Command, + mut command: Command, ) -> anyhow::Result { command.arg("--db-path").arg("./_xtask_benchmark.ms"); if let Some(master_key) = master_key { @@ -77,7 +110,7 @@ async fn wait_for_health( return Ok(()); } - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + time::sleep(Duration::from_millis(500)).await; // check whether the Meilisearch instance exited early (cut the wait) if let Some(exit_code) = 
meilisearch.try_wait().context("cannot check Meilisearch server process status")? From 9245c89cfef168fdf4f53a3424d4ed79aae756ab Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:00:46 +0100 Subject: [PATCH 138/158] move the macros to milli --- crates/index-scheduler/src/processing.rs | 49 ++--------------------- crates/milli/src/progress.rs | 51 ++++++++++++++++++++---- 2 files changed, 48 insertions(+), 52 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 0bc449199..5212433ef 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -1,6 +1,9 @@ use crate::utils::ProcessingBatch; use enum_iterator::Sequence; -use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}; +use meilisearch_types::milli::{ + make_atomic_progress, make_enum_progress, + progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}, +}; use roaring::RoaringBitmap; use std::{borrow::Cow, sync::Arc}; @@ -55,50 +58,6 @@ impl ProcessingTasks { } } -macro_rules! make_enum_progress { - ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => { - #[repr(u8)] - #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] - #[allow(clippy::enum_variant_names)] - $visibility enum $name { - $($variant),+ - } - - impl Step for $name { - fn name(&self) -> Cow<'static, str> { - use convert_case::Casing; - - match self { - $( - $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into() - ),+ - } - } - - fn current(&self) -> u32 { - *self as u32 - } - - fn total(&self) -> u32 { - Self::CARDINALITY as u32 - } - } - }; -} - -macro_rules! make_atomic_progress { - ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => { - #[derive(Default, Debug, Clone, Copy)] - pub struct $struct_name {} - impl NamedStep for $struct_name { - fn name(&self) -> &'static str { - $step_name - } - } - pub type $atomic_struct_name = AtomicSubStep<$struct_name>; - }; -} - make_enum_progress! { pub enum BatchProgress { ProcessingTasks, diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 63f0fbef8..40a943bd3 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -91,16 +91,53 @@ impl Step for AtomicSubStep { } } -#[derive(Default)] -pub struct Document {} +#[macro_export] +macro_rules! make_enum_progress { + ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => { + #[repr(u8)] + #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] + #[allow(clippy::enum_variant_names)] + $visibility enum $name { + $($variant),+ + } -impl NamedStep for Document { - fn name(&self) -> &'static str { - "document" - } + impl Step for $name { + fn name(&self) -> Cow<'static, str> { + use convert_case::Casing; + + match self { + $( + $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into() + ),+ + } + } + + fn current(&self) -> u32 { + *self as u32 + } + + fn total(&self) -> u32 { + Self::CARDINALITY as u32 + } + } + }; } -pub type AtomicDocumentStep = AtomicSubStep; +#[macro_export] +macro_rules! 
make_atomic_progress { + ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => { + #[derive(Default, Debug, Clone, Copy)] + pub struct $struct_name {} + impl NamedStep for $struct_name { + fn name(&self) -> &'static str { + $step_name + } + } + pub type $atomic_struct_name = AtomicSubStep<$struct_name>; + }; +} + +make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); #[derive(Debug, Serialize, Clone)] pub struct ProgressView { From c5536c37b59e0efaa6dcd7bc04d07b7bd8696f3c Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:03:06 +0100 Subject: [PATCH 139/158] rename the atomic::name to unit_name --- crates/milli/src/progress.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 40a943bd3..6a4231e91 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -65,7 +65,7 @@ pub trait NamedStep: 'static + Send + Sync + Default { /// - The name of the step doesn't change /// - The total number of steps doesn't change pub struct AtomicSubStep { - name: Name, + unit_name: Name, current: Arc, total: u32, } @@ -73,13 +73,13 @@ pub struct AtomicSubStep { impl AtomicSubStep { pub fn new(total: u32) -> (Arc, Self) { let current = Arc::new(AtomicU32::new(0)); - (current.clone(), Self { current, total, name: Name::default() }) + (current.clone(), Self { current, total, unit_name: Name::default() }) } } impl Step for AtomicSubStep { fn name(&self) -> Cow<'static, str> { - self.name.name().into() + self.unit_name.name().into() } fn current(&self) -> u32 { From 85577e70cd47f39a2b891d96f96cc3467ce6d1ae Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:05:34 +0100 Subject: [PATCH 140/158] reuse the enqueued --- crates/index-scheduler/src/batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 733984043..d05af31c3 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -695,7 +695,7 @@ impl IndexScheduler { let (atomic, update_file_progress) = AtomicUpdateFileStep::new(enqueued.len() as u32); progress.update_progress(update_file_progress); - for task_id in self.get_status(&rtxn, Status::Enqueued)? { + for task_id in enqueued { let task = self.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; if let Some(content_uuid) = task.content_uuid() { let src = self.file_store.get_update_path(content_uuid); From f1beb60204e32800c00c47eb86e23fee2082edd8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:07:45 +0100 Subject: [PATCH 141/158] make the progress use payload instead of documents --- crates/milli/src/progress.rs | 1 + crates/milli/src/update/new/indexer/document_operation.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 6a4231e91..8243ec235 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -138,6 +138,7 @@ macro_rules! 
make_atomic_progress { } make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); #[derive(Debug, Serialize, Clone)] pub struct ProgressView { diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 4418944db..a1fc31f61 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -15,7 +15,7 @@ use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; -use crate::progress::{AtomicDocumentStep, Progress}; +use crate::progress::{AtomicDocumentStep, AtomicPayloadStep, Progress}; use crate::update::new::document::Versions; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::MostlySend; @@ -71,7 +71,7 @@ impl<'pl> DocumentOperation<'pl> { let mut primary_key = None; let payload_count = operations.len(); - let (step, progress_step) = AtomicDocumentStep::new(payload_count as u32); + let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32); progress.update_progress(progress_step); for (payload_index, operation) in operations.into_iter().enumerate() { From 5d682b4700789bdb91215d83622aa3e1d4c062c1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:08:45 +0100 Subject: [PATCH 142/158] rename the ComputingTheChanges to ComputingDocumentChanges --- crates/index-scheduler/src/batch.rs | 2 +- crates/index-scheduler/src/processing.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index d05af31c3..9ad43f192 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1359,7 +1359,7 @@ impl IndexScheduler { } }; - progress.update_progress(DocumentOperationProgress::ComputingTheChanges); + progress.update_progress(DocumentOperationProgress::ComputingDocumentChanges); let (document_changes, operation_stats, primary_key) = indexer .into_changes( &indexer_alloc, diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 5212433ef..89bec97e9 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -139,7 +139,7 @@ make_enum_progress! { make_enum_progress! 
{ pub enum DocumentOperationProgress { RetrievingConfig, - ComputingTheChanges, + ComputingDocumentChanges, Indexing, } } From ad4dc7072028d4361d77ac682dfa61afc80a20f7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:09:54 +0100 Subject: [PATCH 143/158] rename the ComputingTheChanges to ComputingDocumentChanges in the edit document progress --- crates/index-scheduler/src/batch.rs | 2 +- crates/index-scheduler/src/processing.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 9ad43f192..a40eac02c 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1509,7 +1509,7 @@ impl IndexScheduler { }; let candidates_count = candidates.len(); - progress.update_progress(DocumentEditionProgress::ComputingTheChanges); + progress.update_progress(DocumentEditionProgress::ComputingDocumentChanges); let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); let document_changes = pool .install(|| { diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 89bec97e9..57d90a40b 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -147,7 +147,7 @@ make_enum_progress! { make_enum_progress! { pub enum DocumentEditionProgress { RetrievingConfig, - ComputingTheChanges, + ComputingDocumentChanges, Indexing, } } From 29fc77ee5b81c9768d5da72ffa228384c79fe545 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:11:19 +0100 Subject: [PATCH 144/158] remove usuless print --- crates/index-scheduler/src/utils.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 3718c69ca..1fcedfddf 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -276,7 +276,6 @@ impl IndexScheduler { .map(|batch_id| { if Some(batch_id) == processing.batch.as_ref().map(|batch| batch.uid) { let mut batch = processing.batch.as_ref().unwrap().to_batch(); - println!("here with progress: {}", processing.progress.is_some()); batch.progress = processing.get_progress_view(); Ok(batch) } else { From fa885e75b42a2312c6efcdcd315e61daf5bb622f Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:13:12 +0100 Subject: [PATCH 145/158] rename the send_progress in progress --- crates/milli/src/update/new/indexer/document_changes.rs | 6 +++--- crates/milli/src/update/new/indexer/mod.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index 3e2b9c036..a45fcee85 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -206,7 +206,7 @@ pub fn extract< doc_allocs, fields_ids_map_store, must_stop_processing, - progress: send_progress, + progress, }: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, datastore: &'data ThreadLocal, @@ -217,7 +217,7 @@ where MSP: Fn() -> bool + Sync, { tracing::trace!("We are resetting the extractor allocators"); - send_progress.update_progress(step); + progress.update_progress(step); // Clean up and reuse the extractor allocs for extractor_alloc in extractor_allocs.iter_mut() { tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes()); @@ -226,7 +226,7 @@ where let 
total_documents = document_changes.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); - send_progress.update_progress(progress_step); + progress.update_progress(progress_step); let pi = document_changes.iter(CHUNK_SIZE); pi.try_arc_for_each_try_init( diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index acdf78304..a850c0d03 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -72,7 +72,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>( document_changes: &DC, embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, - send_progress: &'indexer Progress, + progress: &'indexer Progress, ) -> Result<()> where DC: DocumentChanges<'pl>, @@ -125,7 +125,7 @@ where doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing, - progress: send_progress, + progress, }; let mut index_embeddings = index.embedding_configs(wtxn)?; From 45d5d4bf40450b9010ef2b935393c60b0068c4e0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:15:33 +0100 Subject: [PATCH 146/158] make the progressview public --- crates/milli/src/progress.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 8243ec235..3c7a35c89 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -141,14 +141,16 @@ make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); #[derive(Debug, Serialize, Clone)] +#[serde(rename_all = "camelCase")] pub struct ProgressView { - steps: Vec, - percentage: f32, + pub steps: Vec, + pub percentage: f32, } #[derive(Debug, Serialize, Clone)] +#[serde(rename_all = "camelCase")] pub struct ProgressStepView { - name: Cow<'static, str>, - finished: u32, - total: u32, + pub name: Cow<'static, str>, + pub finished: u32, + pub total: u32, } From ab9213fa942b1037478fd3257b618faee01d22bc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:16:20 +0100 Subject: [PATCH 147/158] ensure we never write the progress to the db --- crates/meilisearch-types/src/batches.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 57c609320..34af21f60 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -16,7 +16,7 @@ pub type BatchId = u32; pub struct Batch { pub uid: BatchId, - #[serde(skip_deserializing)] + #[serde(skip)] pub progress: Option, pub details: DetailsView, pub stats: BatchStats, From 75d5cea62470c17a3341c40f0eeefbcf81590f9d Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:17:33 +0100 Subject: [PATCH 148/158] use a with_capacity while allocating the progress view --- crates/milli/src/progress.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 3c7a35c89..96483ebd0 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -37,7 +37,7 @@ impl Progress { let mut percentage = 0.0; let mut prev_factors = 1.0; - let mut step_view = Vec::new(); + let mut step_view = Vec::with_capacity(steps.len()); for (_, step) in steps.iter() { prev_factors *= step.total() as f32; percentage += step.current() as f32 / prev_factors; From 08fd026ebdc8638ff283e8b301346f6d92219530 Mon Sep 17 
00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:18:13 +0100 Subject: [PATCH 149/158] fix warning --- crates/milli/src/update/new/indexer/document_operation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index a1fc31f61..090c1eb8e 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -15,7 +15,7 @@ use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; -use crate::progress::{AtomicDocumentStep, AtomicPayloadStep, Progress}; +use crate::progress::{AtomicPayloadStep, Progress}; use crate::update::new::document::Versions; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::MostlySend; From 8cd3a1aa571f7a1489dc84ffdad6ce790279cba1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:18:40 +0100 Subject: [PATCH 150/158] fmt --- crates/index-scheduler/src/error.rs | 3 ++- .../index-scheduler/src/index_mapper/mod.rs | 9 ++++---- crates/index-scheduler/src/processing.rs | 13 ++++++----- crates/meilisearch-types/src/batch_view.rs | 8 +++---- crates/meilisearch-types/src/batches.rs | 6 ++--- crates/meilisearch/src/routes/batches.rs | 22 +++++++++---------- crates/meilitool/src/upgrade/mod.rs | 1 - crates/meilitool/src/upgrade/v1_10.rs | 17 +++++--------- crates/meilitool/src/upgrade/v1_11.rs | 10 ++++----- crates/meilitool/src/upgrade/v1_12.rs | 3 ++- crates/milli/src/progress.rs | 12 ++++------ 11 files changed, 47 insertions(+), 57 deletions(-) diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index 5fb04828c..69da70a7e 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -1,12 +1,13 @@ use std::fmt::Display; -use crate::TaskId; use meilisearch_types::batches::BatchId; use meilisearch_types::error::{Code, ErrorCode}; use meilisearch_types::tasks::{Kind, Status}; use meilisearch_types::{heed, milli}; use thiserror::Error; +use crate::TaskId; + #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DateField { BeforeEnqueuedAt, diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 8b9ef3597..2f5b176ed 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -3,10 +3,6 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use std::{fs, thread}; -use self::index_map::IndexMap; -use self::IndexStatus::{Available, BeingDeleted, Closing, Missing}; -use crate::uuid_codec::UuidCodec; -use crate::{Error, Result}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; use meilisearch_types::milli; @@ -17,6 +13,11 @@ use time::OffsetDateTime; use tracing::error; use uuid::Uuid; +use self::index_map::IndexMap; +use self::IndexStatus::{Available, BeingDeleted, Closing, Missing}; +use crate::uuid_codec::UuidCodec; +use crate::{Error, Result}; + mod index_map; const INDEX_MAPPING: &str = "index-mapping"; diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 57d90a40b..74802831e 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -1,11 +1,12 @@ -use 
crate::utils::ProcessingBatch; +use std::borrow::Cow; +use std::sync::Arc; + use enum_iterator::Sequence; -use meilisearch_types::milli::{ - make_atomic_progress, make_enum_progress, - progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}, -}; +use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}; +use meilisearch_types::milli::{make_atomic_progress, make_enum_progress}; use roaring::RoaringBitmap; -use std::{borrow::Cow, sync::Arc}; + +use crate::utils::ProcessingBatch; #[derive(Clone)] pub struct ProcessingTasks { diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index a3d7f834f..08d25413c 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -2,11 +2,9 @@ use milli::progress::ProgressView; use serde::Serialize; use time::{Duration, OffsetDateTime}; -use crate::{ - batches::{Batch, BatchId, BatchStats}, - task_view::DetailsView, - tasks::serialize_duration, -}; +use crate::batches::{Batch, BatchId, BatchStats}; +use crate::task_view::DetailsView; +use crate::tasks::serialize_duration; #[derive(Debug, Clone, Serialize)] #[serde(rename_all = "camelCase")] diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 34af21f60..664dafa7a 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -4,10 +4,8 @@ use milli::progress::ProgressView; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; -use crate::{ - task_view::DetailsView, - tasks::{Kind, Status}, -}; +use crate::task_view::DetailsView; +use crate::tasks::{Kind, Status}; pub type BatchId = u32; diff --git a/crates/meilisearch/src/routes/batches.rs b/crates/meilisearch/src/routes/batches.rs index 6faedc021..4d42cdd16 100644 --- a/crates/meilisearch/src/routes/batches.rs +++ b/crates/meilisearch/src/routes/batches.rs @@ -1,18 +1,18 @@ -use actix_web::{ - web::{self, Data}, - HttpResponse, -}; +use actix_web::web::{self, Data}; +use actix_web::HttpResponse; use deserr::actix_web::AwebQueryParameter; use index_scheduler::{IndexScheduler, Query}; -use meilisearch_types::{ - batch_view::BatchView, batches::BatchId, deserr::DeserrQueryParamError, error::ResponseError, - keys::actions, -}; +use meilisearch_types::batch_view::BatchView; +use meilisearch_types::batches::BatchId; +use meilisearch_types::deserr::DeserrQueryParamError; +use meilisearch_types::error::ResponseError; +use meilisearch_types::keys::actions; use serde::Serialize; -use crate::extractors::{authentication::GuardedData, sequential_extractor::SeqHandler}; - -use super::{tasks::TasksFilterQuery, ActionPolicy}; +use super::tasks::TasksFilterQuery; +use super::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::extractors::sequential_extractor::SeqHandler; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::get().to(SeqHandler(get_batches)))) diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 50882f610..14f941311 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -7,7 +7,6 @@ use std::path::{Path, PathBuf}; use anyhow::{bail, Context}; use meilisearch_types::versioning::create_version_file; - use v1_10::v1_9_to_v1_10; use v1_12::v1_11_to_v1_12; diff --git a/crates/meilitool/src/upgrade/v1_10.rs b/crates/meilitool/src/upgrade/v1_10.rs index 2efc1773c..4a49ea471 100644 --- 
a/crates/meilitool/src/upgrade/v1_10.rs +++ b/crates/meilitool/src/upgrade/v1_10.rs @@ -1,18 +1,13 @@ -use anyhow::bail; use std::path::Path; -use anyhow::Context; -use meilisearch_types::{ - heed::{ - types::{SerdeJson, Str}, - Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, - }, - milli::index::{db_name, main_key}, -}; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; +use anyhow::{bail, Context}; +use meilisearch_types::heed::types::{SerdeJson, Str}; +use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; +use meilisearch_types::milli::index::{db_name, main_key}; use super::v1_9; +use crate::uuid_codec::UuidCodec; +use crate::{try_opening_database, try_opening_poly_database}; pub type FieldDistribution = std::collections::BTreeMap; diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index 0c84d3842..92d853dd0 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -7,12 +7,12 @@ use std::path::Path; use anyhow::Context; -use meilisearch_types::{ - heed::{types::Str, Database, EnvOpenOptions}, - milli::index::db_name, -}; +use meilisearch_types::heed::types::Str; +use meilisearch_types::heed::{Database, EnvOpenOptions}; +use meilisearch_types::milli::index::db_name; -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; +use crate::uuid_codec::UuidCodec; +use crate::{try_opening_database, try_opening_poly_database}; pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { println!("Upgrading from v1.10.0 to v1.11.0"); diff --git a/crates/meilitool/src/upgrade/v1_12.rs b/crates/meilitool/src/upgrade/v1_12.rs index 85fb41472..444617375 100644 --- a/crates/meilitool/src/upgrade/v1_12.rs +++ b/crates/meilitool/src/upgrade/v1_12.rs @@ -1,7 +1,8 @@ //! The breaking changes that happened between the v1.11 and the v1.12 are: //! - The new indexer changed the update files format from OBKV to ndjson. 
https://github.com/meilisearch/meilisearch/pull/4900 -use std::{io::BufWriter, path::Path}; +use std::io::BufWriter; +use std::path::Path; use anyhow::Context; use file_store::FileStore; diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 96483ebd0..d50be43cb 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -1,11 +1,7 @@ -use std::{ - any::TypeId, - borrow::Cow, - sync::{ - atomic::{AtomicU32, Ordering}, - Arc, RwLock, - }, -}; +use std::any::TypeId; +use std::borrow::Cow; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, RwLock}; use serde::Serialize; From d12364c1e0a22246652db2497f813031769adb76 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:30:48 +0100 Subject: [PATCH 151/158] fix the tests --- crates/index-scheduler/src/lib.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index d3e65c6f8..f5f73087d 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4308,16 +4308,6 @@ mod tests { snapshot!(batch, @r#" { "uid": 0, - "progress": { - "steps": [ - { - "name": "processing tasks", - "finished": 0, - "total": 2 - } - ], - "percentage": 0.0 - }, "details": { "primaryKey": "mouse" }, From 0d0c18f519e44ab30d1b4d91dc2cd0f0b63d9275 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 11 Dec 2024 18:41:03 +0100 Subject: [PATCH 152/158] rename the Step::name into Step::current_step --- crates/index-scheduler/src/processing.rs | 18 +++++++++--------- crates/milli/src/progress.rs | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 74802831e..aca654de9 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -215,7 +215,7 @@ mod test { { "steps": [ { - "name": "processing tasks", + "currentStep": "processing tasks", "finished": 0, "total": 2 } @@ -228,7 +228,7 @@ mod test { { "steps": [ { - "name": "writing tasks to disk", + "currentStep": "writing tasks to disk", "finished": 1, "total": 2 } @@ -248,12 +248,12 @@ mod test { { "steps": [ { - "name": "processing tasks", + "currentStep": "processing tasks", "finished": 0, "total": 2 }, { - "name": "task", + "currentStep": "task", "finished": 0, "total": 10 } @@ -266,12 +266,12 @@ mod test { { "steps": [ { - "name": "processing tasks", + "currentStep": "processing tasks", "finished": 0, "total": 2 }, { - "name": "task", + "currentStep": "task", "finished": 6, "total": 10 } @@ -284,7 +284,7 @@ mod test { { "steps": [ { - "name": "writing tasks to disk", + "currentStep": "writing tasks to disk", "finished": 1, "total": 2 } @@ -299,12 +299,12 @@ mod test { { "steps": [ { - "name": "writing tasks to disk", + "currentStep": "writing tasks to disk", "finished": 1, "total": 2 }, { - "name": "task", + "currentStep": "task", "finished": 4, "total": 5 } diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index d50be43cb..accc2cf56 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -39,7 +39,7 @@ impl Progress { percentage += step.current() as f32 / prev_factors; step_view.push(ProgressStepView { - name: step.name(), + current_step: step.name(), finished: step.current(), total: step.total(), }); @@ -146,7 +146,7 @@ pub struct ProgressView { #[derive(Debug, Serialize, Clone)] #[serde(rename_all = "camelCase")] pub struct ProgressStepView { - pub name: 
Cow<'static, str>, + pub current_step: Cow<'static, str>, pub finished: u32, pub total: u32, } From 1fdfa3f20885abe5d6dcd95eda0f7e4b2678cdd1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 12 Dec 2024 09:26:14 +0100 Subject: [PATCH 153/158] Change the exit code to 130 when Ctrl-Ced --- crates/meilisearch/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index 6e6245c78..ee3bbf430 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -131,7 +131,7 @@ async fn try_main() -> anyhow::Result<()> { tokio::spawn(async move { tokio::signal::ctrl_c().await.unwrap(); - std::process::exit(77); + std::process::exit(130); }); run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; From 6c72559457366da88acf191e1844cb1d353b5127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Dec 2024 09:27:10 +0100 Subject: [PATCH 154/158] Update the binary-path description Co-authored-by: Louis Dureuil --- crates/xtask/src/bench/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/xtask/src/bench/mod.rs b/crates/xtask/src/bench/mod.rs index 491dc33ab..1416c21d9 100644 --- a/crates/xtask/src/bench/mod.rs +++ b/crates/xtask/src/bench/mod.rs @@ -87,7 +87,9 @@ pub struct BenchDeriveArgs { #[arg(long, default_value_t = 60)] tasks_queue_timeout_secs: u64, - /// The path to the binary to run. By default it compiles the binary with cargo. + /// The path to the binary to run. + /// + /// If unspecified, runs `cargo run` after building Meilisearch with `cargo build`. #[arg(long)] binary_path: Option, } From 18ce95dcbf5c8c8ae7527887ae2abf3cb2b1c7a7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 12 Dec 2024 14:56:45 +0100 Subject: [PATCH 155/158] Add test reproducing the bug --- .../meilisearch/tests/search/facet_search.rs | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 19224c3df..696c23f91 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -57,6 +57,116 @@ async fn simple_facet_search() { assert_eq!(response["facetHits"].as_array().unwrap().len(), 1); } +#[actix_rt::test] +async fn simple_facet_search_on_movies() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = json!([ + { + "id": 1, + "title": "Carol", + "genres": [ + "Romance", + "Drama" + ], + "color": [ + "red" + ], + "platforms": [ + "MacOS", + "Linux", + "Windows" + ] + }, + { + "id": 2, + "title": "Wonder Woman", + "genres": [ + "Action", + "Adventure" + ], + "color": [ + "green" + ], + "platforms": [ + "MacOS" + ] + }, + { + "id": 3, + "title": "Life of Pi", + "genres": [ + "Adventure", + "Drama" + ], + "color": [ + "blue" + ], + "platforms": [ + "Windows" + ] + }, + { + "id": 4, + "title": "Mad Max: Fury Road", + "genres": [ + "Adventure", + "Science Fiction" + ], + "color": [ + "red" + ], + "platforms": [ + "MacOS", + "Linux" + ] + }, + { + "id": 5, + "title": "Moana", + "genres": [ + "Fantasy", + "Action" + ], + "color": [ + "red" + ], + "platforms": [ + "Windows" + ] + }, + { + "id": 6, + "title": "Philadelphia", + "genres": [ + "Drama" + ], + "color": [ + "blue" + ], + "platforms": [ + "MacOS", + "Linux", + "Windows" + ] + } + ]); + let (response, code) = + 
index.update_settings_filterable_attributes(json!(["genres", "color"])).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetQuery": "", "facetName": "genres", "q": "" })).await; + + assert_eq!(code, 200, "{}", response); + snapshot!(response["facetHits"], @r###"[{"value":"Action","count":2},{"value":"Adventure","count":3},{"value":"Drama","count":3},{"value":"Fantasy","count":1},{"value":"Romance","count":1},{"value":"Science Fiction","count":1}]"###); +} + #[actix_rt::test] async fn advanced_facet_search() { let server = Server::new().await; From 961de4d34ea3821ba24df8c376b6e6cf0d5a307a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 12 Dec 2024 14:56:59 +0100 Subject: [PATCH 156/158] Fix facet fst --- .../milli/src/update/new/facet_search_builder.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index 1993c1d00..d1ff6096d 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -103,6 +103,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> { #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")] pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> { + tracing::trace!("merge facet strings for facet search: {:?}", self.registered_facets); + let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString); builder.extend(reader); @@ -118,12 +120,15 @@ impl<'indexer> FacetSearchBuilder<'indexer> { BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?; if current_field_id != Some(field_id) { - if let Some(fst_merger_builder) = fst_merger_builder { + if let (Some(current_field_id), Some(fst_merger_builder)) = + (current_field_id, fst_merger_builder) + { let mmap = fst_merger_builder.build(&mut callback)?; - index - .facet_id_string_fst - .remap_data_type::() - .put(wtxn, &field_id, &mmap)?; + index.facet_id_string_fst.remap_data_type::().put( + wtxn, + ¤t_field_id, + &mmap, + )?; } fst = index.facet_id_string_fst.get(rtxn, &field_id)?; From 2f3cc8cdd2505fc9ba9b6bc435ca822101a54542 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 12 Dec 2024 16:15:37 +0100 Subject: [PATCH 157/158] Fix the merge_caches_sorted function --- crates/milli/src/update/new/extract/cache.rs | 57 +++++++++----------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 09ca60211..62c00d2b1 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -477,21 +477,16 @@ where F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, { let mut maps = Vec::new(); - let mut readers = Vec::new(); - let mut current_bucket = None; - for FrozenCache { bucket, cache, ref mut spilled } in frozen { - assert_eq!(*current_bucket.get_or_insert(bucket), bucket); - maps.push(cache); - readers.append(spilled); - } - - // First manage the spilled entries by looking into the HashMaps, - // merge them and mark them as dummy. 
let mut heap = BinaryHeap::new(); - for (source_index, source) in readers.into_iter().enumerate() { - let mut cursor = source.into_cursor()?; - if cursor.move_on_next()?.is_some() { - heap.push(Entry { cursor, source_index }); + let mut current_bucket = None; + for FrozenCache { bucket, cache, spilled } in frozen { + assert_eq!(*current_bucket.get_or_insert(bucket), bucket); + maps.push((bucket, cache)); + for reader in spilled { + let mut cursor = reader.into_cursor()?; + if cursor.move_on_next()?.is_some() { + heap.push(Entry { cursor, bucket }); + } } } @@ -508,25 +503,25 @@ where let mut output = DelAddRoaringBitmap::from_bytes(first_value)?; while let Some(mut entry) = heap.peek_mut() { - if let Some((key, _value)) = entry.cursor.current() { - if first_key == key { - let new = DelAddRoaringBitmap::from_bytes(first_value)?; - output = output.merge(new); - // When we are done we the current value of this entry move make - // it move forward and let the heap reorganize itself (on drop) - if entry.cursor.move_on_next()?.is_none() { - PeekMut::pop(entry); - } - } else { + if let Some((key, value)) = entry.cursor.current() { + if first_key != key { break; } + + let new = DelAddRoaringBitmap::from_bytes(value)?; + output = output.merge(new); + // When we are done we the current value of this entry move make + // it move forward and let the heap reorganize itself (on drop) + if entry.cursor.move_on_next()?.is_none() { + PeekMut::pop(entry); + } } } // Once we merged all of the spilled bitmaps we must also // fetch the entries from the non-spilled entries (the HashMaps). - for (map_index, map) in maps.iter_mut().enumerate() { - if first_entry.source_index != map_index { + for (map_bucket, map) in maps.iter_mut() { + if first_entry.bucket != *map_bucket { if let Some(new) = map.get_mut(first_key) { output.union_and_clear_bbbul(new); } @@ -538,12 +533,12 @@ where // Don't forget to put the first entry back into the heap. if first_entry.cursor.move_on_next()?.is_some() { - heap.push(first_entry) + heap.push(first_entry); } } // Then manage the content on the HashMap entries that weren't taken (mem::take). 
- while let Some(mut map) = maps.pop() { + while let Some((_, mut map)) = maps.pop() { // Make sure we don't try to work with entries already managed by the spilled let mut ordered_entries: Vec<_> = map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect(); @@ -553,7 +548,7 @@ where let mut output = DelAddRoaringBitmap::empty(); output.union_and_clear_bbbul(bbbul); - for rhs in maps.iter_mut() { + for (_, rhs) in maps.iter_mut() { if let Some(new) = rhs.get_mut(key) { output.union_and_clear_bbbul(new); } @@ -569,14 +564,14 @@ where struct Entry { cursor: ReaderCursor, - source_index: usize, + bucket: usize, } impl Ord for Entry { fn cmp(&self, other: &Entry) -> Ordering { let skey = self.cursor.current().map(|(k, _)| k); let okey = other.cursor.current().map(|(k, _)| k); - skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse() + skey.cmp(&okey).then(self.bucket.cmp(&other.bucket)).reverse() } } From acdd5aa6ea143b2b92079e50cc0e22afeebee570 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 12 Dec 2024 17:54:28 +0100 Subject: [PATCH 158/158] Use the thread source id instead of the destination id when filtering on the cache to merge --- crates/milli/src/update/new/extract/cache.rs | 44 ++++++++++++-------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 62c00d2b1..e2c8bb5fe 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -177,12 +177,12 @@ impl<'extractor> BalancedCaches<'extractor> { Ok(()) } - pub fn freeze(&mut self) -> Result>> { + pub fn freeze(&mut self, source_id: usize) -> Result>> { match &mut self.caches { InnerCaches::Normal(NormalCaches { caches }) => caches .iter_mut() .enumerate() - .map(|(bucket, map)| { + .map(|(bucket_id, map)| { // safety: we are transmuting the Bbbul into a FrozenBbbul // that are the same size. let map = unsafe { @@ -201,14 +201,19 @@ impl<'extractor> BalancedCaches<'extractor> { >, >(map) }; - Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() }) + Ok(FrozenCache { + source_id, + bucket_id, + cache: FrozenMap::new(map), + spilled: Vec::new(), + }) }) .collect(), InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches .iter_mut() .zip(mem::take(spilled_entries)) .enumerate() - .map(|(bucket, (map, sorter))| { + .map(|(bucket_id, (map, sorter))| { let spilled = sorter .into_reader_cursors()? .into_iter() @@ -234,7 +239,7 @@ impl<'extractor> BalancedCaches<'extractor> { >, >(map) }; - Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled }) + Ok(FrozenCache { source_id, bucket_id, cache: FrozenMap::new(map), spilled }) }) .collect(), } @@ -440,7 +445,8 @@ fn spill_entry_to_sorter( } pub struct FrozenCache<'a, 'extractor> { - bucket: usize, + bucket_id: usize, + source_id: usize, cache: FrozenMap< 'a, 'extractor, @@ -457,9 +463,9 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>( let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0); let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect(); - for thread_cache in caches { - for frozen in thread_cache.freeze()? { - bucket_caches[frozen.bucket].push(frozen); + for (thread_index, thread_cache) in caches.iter_mut().enumerate() { + for frozen in thread_cache.freeze(thread_index)? 
{ + bucket_caches[frozen.bucket_id].push(frozen); } } @@ -479,13 +485,13 @@ where let mut maps = Vec::new(); let mut heap = BinaryHeap::new(); let mut current_bucket = None; - for FrozenCache { bucket, cache, spilled } in frozen { - assert_eq!(*current_bucket.get_or_insert(bucket), bucket); - maps.push((bucket, cache)); + for FrozenCache { source_id, bucket_id, cache, spilled } in frozen { + assert_eq!(*current_bucket.get_or_insert(bucket_id), bucket_id); + maps.push((source_id, cache)); for reader in spilled { let mut cursor = reader.into_cursor()?; if cursor.move_on_next()?.is_some() { - heap.push(Entry { cursor, bucket }); + heap.push(Entry { cursor, source_id }); } } } @@ -520,8 +526,12 @@ where // Once we merged all of the spilled bitmaps we must also // fetch the entries from the non-spilled entries (the HashMaps). - for (map_bucket, map) in maps.iter_mut() { - if first_entry.bucket != *map_bucket { + for (source_id, map) in maps.iter_mut() { + debug_assert!( + !(map.get(first_key).is_some() && first_entry.source_id == *source_id), + "A thread should not have spiled a key that has been inserted in the cache" + ); + if first_entry.source_id != *source_id { if let Some(new) = map.get_mut(first_key) { output.union_and_clear_bbbul(new); } @@ -564,14 +574,14 @@ where struct Entry { cursor: ReaderCursor, - bucket: usize, + source_id: usize, } impl Ord for Entry { fn cmp(&self, other: &Entry) -> Ordering { let skey = self.cursor.current().map(|(k, _)| k); let okey = other.cursor.current().map(|(k, _)| k); - skey.cmp(&okey).then(self.bucket.cmp(&other.bucket)).reverse() + skey.cmp(&okey).then(self.source_id.cmp(&other.source_id)).reverse() } }